1. Data Visualization

2024-03-10

penguins
## # A tibble: 344 × 7
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # ℹ 334 more rows
## # ℹ 1 more variable: sex <fct>
glimpse(penguins)
## Rows: 344
## Columns: 7
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
?penguins
## Help on topic 'penguins' was found in the following packages:
## 
##   Package               Library
##   modeldata             /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##   palmerpenguins        /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
## 
## 
## Using the first match ...
# Body mass against flipper length. The x/y mapping is supplied to ggplot()
# itself, so both layers use it; color and shape are mapped inside
# geom_point() only, so geom_smooth() fits a single line to all penguins.
# The `data =` and `mapping =` argument names are optional and omitted here
# (both arguments are matched by position).
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  geom_smooth(method = "lm") +
  labs(
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species",
    shape = "Species",
    title = "Body mass and flipper length",
    subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
    caption = "Data come from the palmerpenguins package"
  ) +
  scale_color_colorblind()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

dim(penguins) # 344 rows, 7 columns
## [1] 344   7
?penguins # bill_depth_mm: a number denoting bill depth (millimeters), i.e. thickness
## Help on topic 'penguins' was found in the following packages:
## 
##   Package               Library
##   modeldata             /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##   palmerpenguins        /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
## 
## 
## Using the first match ...
# Scatterplot of bill depth against bill length.
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm)
) +
  geom_point() + # appears positive at a glance; NOTE(review): confirm, e.g. by adding geom_smooth(method = "lm")
  labs(
    title = "Bill length and bill depth",
    x = "Bill length (mm)",
    y = "Bill depth (mm)"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Bill length plotted as points, with one row of points per species.
ggplot(penguins, aes(x = bill_length_mm, y = species)) +
  geom_point() +
  labs(
    y = "Species",
    x = "Bill length (mm)",
    title = "Bill length and species"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Distribution of bill length for each species, drawn as boxplots.
ggplot(penguins, aes(x = bill_length_mm, y = species)) +
  geom_boxplot() +
  labs(
    y = "Species",
    x = "Bill length (mm)",
    title = "Bill length and species"
  )
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Boxplot of bill length by species. na.rm = TRUE removes rows with missing
# values silently, so no "Removed 2 rows ..." warning is printed.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = species)
) +
  geom_boxplot(na.rm = TRUE) +
  labs(
    title = "Bill length and species",
    x = "Bill length (mm)",
    y = "Species"
  )

# Number of missing values in each column.
penguins |>
  is.na() |>
  colSums()
##           species            island    bill_length_mm     bill_depth_mm 
##                 0                 0                 2                 2 
## flipper_length_mm       body_mass_g               sex 
##                 2                 2                11
# Points are colored by bill depth; the color mapping lives in geom_point()
# only, so geom_smooth() draws a single curve.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = bill_depth_mm)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# color = island is mapped in ggplot(), so it applies to the points and to
# the smoothers alike; se = FALSE turns off the confidence bands.
ggplot(
  penguins,
  aes(x = flipper_length_mm, y = body_mass_g, color = island)
) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Data and mapping are given once in ggplot(), so geom_point() and
# geom_smooth() both inherit them.
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ggplot() is called with no arguments; the same data and x/y mapping are
# passed to each layer separately.
ggplot() +
  geom_point(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  ) +
  geom_smooth(
    data = penguins,
    mapping = aes(x = flipper_length_mm, y = body_mass_g)
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Chapter 1

# verbose
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# concise
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# with the pipe (covered later)
penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(penguins, aes(x = species)) +
  geom_bar()

# reorder bars by frequency: fct_infreq() reorders the factor levels by how often each occurs
ggplot(penguins, aes(x = fct_infreq(species))) +
  geom_bar()

# 1.4.2 A numerical variable
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 200)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# play with bin width
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 20)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 2000)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# density plot
ggplot(penguins, aes(x = body_mass_g)) +
  geom_density()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

ggplot(penguins, aes(y = species)) + # bars are horizontal when the variable is mapped to y
  geom_bar()

ggplot(penguins, aes(x = species)) +
  geom_bar(color = "red") # outline red

ggplot(penguins, aes(x = species)) +
  geom_bar(fill = "red") # fill in red

# binwidth: the width of each bar, in units of the x variable

ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.05)

ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.50)

# boxplot
ggplot(penguins, aes(x = species, y = body_mass_g)) +
  geom_boxplot()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# density plot, customized the thickness of the lines using the linewidth argument in order to make them stand out a bit more against the background.
ggplot(penguins, aes(x = body_mass_g, color = species)) +
  geom_density(linewidth = 0.75)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

# alpha aesthetic to add transparency to the filled density curves.
ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +
  geom_density(alpha = 0.5)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

# 1.5.2 Two categorical variables

# difficult to interpret: the bars have different sizes
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar()

# A relative frequency plot, created by setting position = "fill" in the geom, is more useful for comparing species distributions across islands since it’s not affected by the unequal numbers of penguins across the islands.
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")

# 1.5.3 Two numerical variables

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# 3 or more variables, but cluttered and difficult to read
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = island))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# solution 1: facets!
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  facet_wrap(~island) # separate plots, one per island
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

mpg
## # A tibble: 234 × 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
## # ℹ 224 more rows
?mpg

ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty)
) +
  geom_point()

ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty, color = cty)
) +
  geom_point()

ggplot(
  mpg,
  aes(x = hwy, y = displ, size = cty, color = cty, shape = drv)
) +
  geom_point()

# linewidth is mapped here as well, but it is ignored when drawing the points.
ggplot(
  mpg,
  aes(
    x = hwy, y = displ,
    size = cty, color = cty,
    shape = drv, linewidth = cty
  )
) +
  geom_point()

ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm)) +
  geom_point(aes(color = species)) +
  facet_wrap(~species)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(
  data = penguins,
  mapping = aes(
    x = bill_length_mm, y = bill_depth_mm,
    color = species, shape = species
  )
) +
  geom_point() +
  labs(color = "species") # lower case to match var name
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill") # differences between islands

ggplot(penguins, aes(x = species, fill = island)) +
  geom_bar(position = "fill") # differences between species

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave(filename = "plots/penguin-plot.svg")
## Saving 8 x 5 in image
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(mpg, aes(x = class)) +
  geom_bar()

ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()

ggsave("plots/mpg-plot.pdf") # no plot argument given, so the last plot is saved; remember F1 opens help
## Saving 8 x 5 in image
my_variable <- 10
my_variable # watch for typos: the name must be spelled exactly as assigned, my_variable
## [1] 10
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# option shift k for keyboard shortcuts

my_bar_plot <- ggplot(mpg, aes(x = class)) +
  geom_bar()
my_bar_plot

my_scatter_plot <- ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()
my_scatter_plot

ggsave(filename = "plots/mpg-plot.png", plot = my_bar_plot) # my bar plot is saved
## Saving 8 x 5 in image

Ch 3

# Install the data packages if they are not available yet, then attach
# everything this chapter uses. requireNamespace() only checks availability
# without attaching; the library() calls below do the attaching and stop with
# an error if a package is still missing.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
if (!requireNamespace("Lahman", quietly = TRUE)) {
  install.packages("Lahman")
}
library(nycflights13)
library(tidyverse)
library(dplyr) # also attached by tidyverse; listed explicitly
library(Lahman)

# dplyr conflicts with (masks) filter() and lag(). If you want to use the base version of these functions after loading dplyr, you’ll need to use their full names: stats::filter() and stats::lag().
?flights
flights
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
glimpse(flights) # <int> is short for integer, <dbl> is short for double (aka real numbers), <chr> for character (aka strings), and <dttm> for date-time.
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Mean arrival delay for each day, for flights whose destination is IAH.
# na.rm = TRUE leaves missing arr_delay values out of the mean.
# The result stays grouped by year and month (see the message printed below).
flights |>
  filter(dest == "IAH") |>
  group_by(year, month, day) |>
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day arr_delay
##    <int> <int> <int>     <dbl>
##  1  2013     1     1     17.8 
##  2  2013     1     2      7   
##  3  2013     1     3     18.3 
##  4  2013     1     4     -3.2 
##  5  2013     1     5     20.2 
##  6  2013     1     6      9.28
##  7  2013     1     7     -7.74
##  8  2013     1     8      7.79
##  9  2013     1     9     18.1 
## 10  2013     1    10      6.68
## # ℹ 355 more rows
# find all flights that departed more than 120 minutes (two hours) late:
flights |>
  filter(dep_delay > 120)
## # A tibble: 9,723 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      848           1835       853     1001           1950
##  2  2013     1     1      957            733       144     1056            853
##  3  2013     1     1     1114            900       134     1447           1222
##  4  2013     1     1     1540           1338       122     2020           1825
##  5  2013     1     1     1815           1325       290     2120           1542
##  6  2013     1     1     1842           1422       260     1958           1535
##  7  2013     1     1     1856           1645       131     2212           2005
##  8  2013     1     1     1934           1725       129     2126           1855
##  9  2013     1     1     1938           1703       155     2109           1823
## 10  2013     1     1     1942           1705       157     2124           1830
## # ℹ 9,713 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that departed on January 1
flights |>
  filter(month == 1 & day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that departed in January or February
flights |>
  filter(month == 1 | month == 2)
## # A tibble: 51,955 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 51,945 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# A shorter way to select flights that departed in January or February
flights |>
  filter(month %in% c(1, 2))
## # A tibble: 51,955 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 51,945 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights |>
  filter(month %in% c(11, 12))
## # A tibble: 55,403 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    11     1        5           2359         6      352            345
##  2  2013    11     1       35           2250       105      123           2356
##  3  2013    11     1      455            500        -5      641            651
##  4  2013    11     1      539            545        -6      856            827
##  5  2013    11     1      542            545        -3      831            855
##  6  2013    11     1      549            600       -11      912            923
##  7  2013    11     1      550            600       -10      705            659
##  8  2013    11     1      554            600        -6      659            701
##  9  2013    11     1      554            600        -6      826            827
## 10  2013    11     1      554            600        -6      749            751
## # ℹ 55,393 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
feb <- flights |>
  filter(month == 2)
feb
## # A tibble: 24,951 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     2     1      456            500        -4      652            648
##  2  2013     2     1      520            525        -5      816            820
##  3  2013     2     1      527            530        -3      837            829
##  4  2013     2     1      532            540        -8     1007           1017
##  5  2013     2     1      540            540         0      859            850
##  6  2013     2     1      552            600        -8      714            715
##  7  2013     2     1      552            600        -8      919            910
##  8  2013     2     1      552            600        -8      655            709
##  9  2013     2     1      553            600        -7      833            815
## 10  2013     2     1      553            600        -7      821            825
## # ℹ 24,941 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights |>
  arrange(year, month, day, dep_delay, dep_time)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      940            955       -15     1226           1220
##  2  2013     1     1     2030           2045       -15     2150           2225
##  3  2013     1     1     1716           1730       -14     1947           1953
##  4  2013     1     1      946            959       -13     1146           1202
##  5  2013     1     1     2217           2229       -12      249            315
##  6  2013     1     1      839            850       -11     1027           1035
##  7  2013     1     1     1849           1900       -11     2131           2129
##  8  2013     1     1      800            810       -10      949            955
##  9  2013     1     1      805            815       -10     1006           1010
## 10  2013     1     1      820            830       -10      940            954
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights |>
  arrange(desc(dep_delay))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# 3.2.4 distinct()
flights |>
  distinct() # remove any duplicate rows
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Find all unique origin and destination pairs
flights |>
  distinct(origin, dest)
## # A tibble: 224 × 2
##    origin dest 
##    <chr>  <chr>
##  1 EWR    IAH  
##  2 LGA    IAH  
##  3 JFK    MIA  
##  4 JFK    BQN  
##  5 LGA    ATL  
##  6 EWR    ORD  
##  7 EWR    FLL  
##  8 LGA    IAD  
##  9 JFK    MCO  
## 10 LGA    ORD  
## # ℹ 214 more rows
# To keep the other columns when filtering for unique rows, use the .keep_all = TRUE option.
flights |>
  distinct(origin, dest, .keep_all = TRUE)
## # A tibble: 224 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 214 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights |>
  count(origin, dest, sort = TRUE)
## # A tibble: 224 × 3
##    origin dest      n
##    <chr>  <chr> <int>
##  1 JFK    LAX   11262
##  2 LGA    ATL   10263
##  3 LGA    ORD    8857
##  4 JFK    SFO    8204
##  5 LGA    CLT    6168
##  6 EWR    ORD    6100
##  7 JFK    BOS    5898
##  8 LGA    MIA    5781
##  9 JFK    MCO    5464
## 10 EWR    BOS    5327
## # ℹ 214 more rows
flights |>
  count(year, month, sort = TRUE)
## # A tibble: 12 × 3
##     year month     n
##    <int> <int> <int>
##  1  2013     7 29425
##  2  2013     8 29327
##  3  2013    10 28889
##  4  2013     3 28834
##  5  2013     5 28796
##  6  2013     4 28330
##  7  2013     6 28243
##  8  2013    12 28135
##  9  2013     9 27574
## 10  2013    11 27268
## 11  2013     1 27004
## 12  2013     2 24951
# Flights whose dep_delay is 120 or more. Rows with a missing dep_delay are
# dropped, because filter() only keeps rows where the condition is TRUE.
filter(flights, dep_delay >= 120)
## # A tibble: 9,888 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      848           1835       853     1001           1950
##  2  2013     1     1      957            733       144     1056            853
##  3  2013     1     1     1114            900       134     1447           1222
##  4  2013     1     1     1540           1338       122     2020           1825
##  5  2013     1     1     1815           1325       290     2120           1542
##  6  2013     1     1     1842           1422       260     1958           1535
##  7  2013     1     1     1856           1645       131     2212           2005
##  8  2013     1     1     1934           1725       129     2126           1855
##  9  2013     1     1     1938           1703       155     2109           1823
## 10  2013     1     1     1942           1705       157     2124           1830
## # ℹ 9,878 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights whose destination is either IAH or HOU.
flights |>
  filter(dest == "IAH" | dest == "HOU")
## # A tibble: 9,313 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      623            627        -4      933            932
##  4  2013     1     1      728            732        -4     1041           1038
##  5  2013     1     1      739            739         0     1104           1038
##  6  2013     1     1      908            908         0     1228           1219
##  7  2013     1     1     1028           1026         2     1350           1339
##  8  2013     1     1     1044           1045        -1     1352           1351
##  9  2013     1     1     1114            900       134     1447           1222
## 10  2013     1     1     1205           1200         5     1503           1505
## # ℹ 9,303 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights operated by any of the carriers UA, AA or DL.
filter(flights, carrier %in% c("UA", "AA", "DL"))
## # A tibble: 139,504 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      554            600        -6      812            837
##  5  2013     1     1      554            558        -4      740            728
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            600        -1      941            910
## 10  2013     1     1      559            600        -1      854            902
## # ℹ 139,494 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that departed in months 6 through 8 (June, July, August).
flights |>
  filter(month %in% 6:8)
## # A tibble: 86,995 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     6     1        2           2359         3      341            350
##  2  2013     6     1      451            500        -9      624            640
##  3  2013     6     1      506            515        -9      715            800
##  4  2013     6     1      534            545       -11      800            829
##  5  2013     6     1      538            545        -7      925            922
##  6  2013     6     1      539            540        -1      832            840
##  7  2013     6     1      546            600       -14      850            910
##  8  2013     6     1      551            600        -9      828            850
##  9  2013     6     1      552            600        -8      647            655
## 10  2013     6     1      553            600        -7      700            711
## # ℹ 86,985 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights with arr_delay above 120 whose dep_delay is exactly 0.
# Comma-separated conditions in filter() are combined with "and".
# NOTE(review): if the intent is "did not leave late", early departures
# (dep_delay < 0) would need dep_delay <= 0 instead -- confirm.
flights |>
  filter(arr_delay > 120, dep_delay == 0)
## # A tibble: 3 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013    10     7     1350           1350         0     1736           1526
## 2  2013     5    23     1810           1810         0     2208           2000
## 3  2013     7     1      905            905         0     1443           1223
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# All flights ordered from the largest dep_delay to the smallest.
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# All flights ordered by dep_time, smallest value first.
arrange(flights, dep_time)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    13        1           2249        72      108           2357
##  2  2013     1    31        1           2100       181      124           2225
##  3  2013    11    13        1           2359         2      442            440
##  4  2013    12    16        1           2359         2      447            437
##  5  2013    12    20        1           2359         2      430            440
##  6  2013    12    26        1           2359         2      437            440
##  7  2013    12    30        1           2359         2      441            437
##  8  2013     2    11        1           2100       181      111           2225
##  9  2013     2    24        1           2245        76      121           2354
## 10  2013     3     8        1           2355         6      431            440
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Bring dep_time and arr_time to the front, then order rows by the raw
# difference arr_time - dep_time (most negative first).
# NOTE(review): both columns look like clock times stored as integers
# (e.g. 2400 and 54 in the output), so this difference is not an elapsed
# duration and flights crossing midnight sort first -- confirm intent.
flights |>
  relocate(dep_time, arr_time) |>
  arrange(arr_time - dep_time)
## # A tibble: 336,776 × 19
##    dep_time arr_time  year month   day sched_dep_time dep_delay sched_arr_time
##       <int>    <int> <int> <int> <int>          <int>     <dbl>          <int>
##  1     2400       54  2013     7    17           2142       138           2259
##  2     2400       59  2013    12     9           2250        70           2356
##  3     2338       17  2013     6    12           2129       129           2235
##  4     2332       14  2013    12    29           2155        97           2300
##  5     2335       18  2013    11     6           2215        80           2317
##  6     2347       30  2013     2    25           2145       122           2239
##  7     2351       35  2013     8    13           2152       119           2258
##  8     2342       27  2013    10    11           2030       192           2205
##  9     2356       41  2013     2    26           2000       236           2104
## 10     2342       28  2013     1    24           2159       103           2300
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Count the distinct (month, day) combinations that have at least one flight.
# No arrange() is needed before counting: row order does not affect the
# count, so sorting first is wasted work. Piping into nrow() instead of
# count() would give the same number as a plain integer.
flights |>
  distinct(month, day) |>
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1   365
# Show distance, origin and dest first, then sort with the longest distance
# on top: the top rows are JFK to HNL.
flights |>
  relocate(distance, origin, dest) |>
  arrange(desc(distance))
## # A tibble: 336,776 × 19
##    distance origin dest   year month   day dep_time sched_dep_time dep_delay
##       <dbl> <chr>  <chr> <int> <int> <int>    <int>          <int>     <dbl>
##  1     4983 JFK    HNL    2013     1     1      857            900        -3
##  2     4983 JFK    HNL    2013     1     2      909            900         9
##  3     4983 JFK    HNL    2013     1     3      914            900        14
##  4     4983 JFK    HNL    2013     1     4      900            900         0
##  5     4983 JFK    HNL    2013     1     5      858            900        -2
##  6     4983 JFK    HNL    2013     1     6     1019            900        79
##  7     4983 JFK    HNL    2013     1     7     1042            900       102
##  8     4983 JFK    HNL    2013     1     8      901            900         1
##  9     4983 JFK    HNL    2013     1     9      641            900      1301
## 10     4983 JFK    HNL    2013     1    10      859            900        -1
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, air_time <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Show distance, origin and dest first, then sort with the shortest distance
# on top. The very first row is EWR to LGA (17), but its dep_time is NA; the
# shortest route with a recorded departure is EWR to PHL (80).
flights |>
  relocate(distance, origin, dest) |>
  arrange(distance)
## # A tibble: 336,776 × 19
##    distance origin dest   year month   day dep_time sched_dep_time dep_delay
##       <dbl> <chr>  <chr> <int> <int> <int>    <int>          <int>     <dbl>
##  1       17 EWR    LGA    2013     7    27       NA            106        NA
##  2       80 EWR    PHL    2013     1     3     2127           2129        -2
##  3       80 EWR    PHL    2013     1     4     1240           1200        40
##  4       80 EWR    PHL    2013     1     4     1829           1615       134
##  5       80 EWR    PHL    2013     1     4     2128           2129        -1
##  6       80 EWR    PHL    2013     1     5     1155           1200        -5
##  7       80 EWR    PHL    2013     1     6     2125           2129        -4
##  8       80 EWR    PHL    2013     1     7     2124           2129        -5
##  9       80 EWR    PHL    2013     1     8     2127           2130        -3
## 10       80 EWR    PHL    2013     1     9     2126           2129        -3
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, air_time <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Order does not matter
# mutate(): add gain (dep_delay minus arr_delay) and speed (distance per
# unit of air_time, scaled by 60), then move both new columns to the front.
flights |>
  mutate(gain = dep_delay - arr_delay) |>
  mutate(speed = distance / air_time * 60) |>
  relocate(gain, speed)
## # A tibble: 336,776 × 21
##     gain speed  year month   day dep_time sched_dep_time dep_delay arr_time
##    <dbl> <dbl> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1    -9  370.  2013     1     1      517            515         2      830
##  2   -16  374.  2013     1     1      533            529         4      850
##  3   -31  408.  2013     1     1      542            540         2      923
##  4    17  517.  2013     1     1      544            545        -1     1004
##  5    19  394.  2013     1     1      554            600        -6      812
##  6   -16  288.  2013     1     1      554            558        -4      740
##  7   -24  404.  2013     1     1      555            600        -5      913
##  8    11  259.  2013     1     1      557            600        -3      709
##  9     5  405.  2013     1     1      557            600        -3      838
## 10   -10  319.  2013     1     1      558            600        -2      753
## # ℹ 336,766 more rows
## # ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Add the new columns on the left-hand side directly from mutate().
# The argument must be spelled `.before` (with the leading dot): a plain
# `before = 1` is treated as a third new column named `before` holding 1.
flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    .before = 1
  ) # equivalent to relocate(gain, speed)
## # A tibble: 336,776 × 21
##     gain speed  year month   day dep_time sched_dep_time dep_delay arr_time
##    <dbl> <dbl> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1    -9  370.  2013     1     1      517            515         2      830
##  2   -16  374.  2013     1     1      533            529         4      850
##  3   -31  408.  2013     1     1      542            540         2      923
##  4    17  517.  2013     1     1      544            545        -1     1004
##  5    19  394.  2013     1     1      554            600        -6      812
##  6   -16  288.  2013     1     1      554            558        -4      740
##  7   -24  404.  2013     1     1      555            600        -5      913
##  8    11  259.  2013     1     1      557            600        -3      709
##  9     5  405.  2013     1     1      557            600        -3      838
## 10   -10  319.  2013     1     1      558            600        -2      753
## # ℹ 336,766 more rows
## # ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Place the new columns right after `day`. The leading `.` marks `.after`
# (like `.before`) as an argument to mutate(), not the name of a new column.
mutate(
  flights,
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60,
  .after = day
)
## # A tibble: 336,776 × 21
##     year month   day  gain speed dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int> <dbl> <dbl>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1    -9  370.      517            515         2      830
##  2  2013     1     1   -16  374.      533            529         4      850
##  3  2013     1     1   -31  408.      542            540         2      923
##  4  2013     1     1    17  517.      544            545        -1     1004
##  5  2013     1     1    19  394.      554            600        -6      812
##  6  2013     1     1   -16  288.      554            558        -4      740
##  7  2013     1     1   -24  404.      555            600        -5      913
##  8  2013     1     1    11  259.      557            600        -3      709
##  9  2013     1     1     5  405.      557            600        -3      838
## 10  2013     1     1   -10  319.      558            600        -2      753
## # ℹ 336,766 more rows
## # ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# .keep = "used" retains only the columns that were read or created inside
# mutate(): dep_delay, arr_delay, air_time plus the three new columns.
df_delay_gain <- mutate(
  flights,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours,
  .keep = "used"
)

df_delay_gain
## # A tibble: 336,776 × 6
##    dep_delay arr_delay air_time  gain hours gain_per_hour
##        <dbl>     <dbl>    <dbl> <dbl> <dbl>         <dbl>
##  1         2        11      227    -9 3.78          -2.38
##  2         4        20      227   -16 3.78          -4.23
##  3         2        33      160   -31 2.67         -11.6 
##  4        -1       -18      183    17 3.05           5.57
##  5        -6       -25      116    19 1.93           9.83
##  6        -4        12      150   -16 2.5           -6.4 
##  7        -5        19      158   -24 2.63          -9.11
##  8        -3       -14       53    11 0.883         12.5 
##  9        -3        -8      140     5 2.33           2.14
## 10        -2         8      138   -10 2.3           -4.35
## # ℹ 336,766 more rows
# 3.3.2 select()

# Select columns by name.
select(flights, year, month, day, carrier)
## # A tibble: 336,776 × 4
##     year month   day carrier
##    <int> <int> <int> <chr>  
##  1  2013     1     1 UA     
##  2  2013     1     1 UA     
##  3  2013     1     1 AA     
##  4  2013     1     1 B6     
##  5  2013     1     1 DL     
##  6  2013     1     1 UA     
##  7  2013     1     1 B6     
##  8  2013     1     1 EV     
##  9  2013     1     1 B6     
## 10  2013     1     1 AA     
## # ℹ 336,766 more rows
# Select every column from flight through dest (both ends included).
select(flights, flight:dest)
## # A tibble: 336,776 × 4
##    flight tailnum origin dest 
##     <int> <chr>   <chr>  <chr>
##  1   1545 N14228  EWR    IAH  
##  2   1714 N24211  LGA    IAH  
##  3   1141 N619AA  JFK    MIA  
##  4    725 N804JB  JFK    BQN  
##  5    461 N668DN  LGA    ATL  
##  6   1696 N39463  EWR    ORD  
##  7    507 N516JB  EWR    FLL  
##  8   5708 N829AS  LGA    IAD  
##  9     79 N593JB  JFK    MCO  
## 10    301 N3ALAA  LGA    ORD  
## # ℹ 336,766 more rows
# Select all columns except those from year through day.
# Historically this operation was done with - instead of !, so you’re likely
# to see that in the wild. These two operators serve the same purpose but
# with subtle differences in behavior. We recommend using ! because it reads
# as “not” and combines well with & and |.
select(flights, !year:day)
## # A tibble: 336,776 × 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
##       <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>  
##  1      517            515         2      830            819        11 UA     
##  2      533            529         4      850            830        20 UA     
##  3      542            540         2      923            850        33 AA     
##  4      544            545        -1     1004           1022       -18 B6     
##  5      554            600        -6      812            837       -25 DL     
##  6      554            558        -4      740            728        12 UA     
##  7      555            600        -5      913            854        19 B6     
##  8      557            600        -3      709            723       -14 EV     
##  9      557            600        -3      838            846        -8 B6     
## 10      558            600        -2      753            745         8 AA     
## # ℹ 336,766 more rows
## # ℹ 9 more variables: flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Select every column whose type is character.
select(flights, where(is.character))
## # A tibble: 336,776 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD  
## # ℹ 336,766 more rows
# Rename while selecting: new name on the left, old name on the right.
# Only the selected column is kept in the result.
select(flights, tail_num = tailnum)
## # A tibble: 336,776 × 1
##    tail_num
##    <chr>   
##  1 N14228  
##  2 N24211  
##  3 N619AA  
##  4 N804JB  
##  5 N668DN  
##  6 N39463  
##  7 N516JB  
##  8 N829AS  
##  9 N593JB  
## 10 N3ALAA  
## # ℹ 336,766 more rows
# More on relocation: move the columns year through dep_time so that they
# come after time_hour (i.e. to the end of the table).
relocate(flights, year:dep_time, .after = time_hour)
## # A tibble: 336,776 × 19
##    sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight
##             <int>     <dbl>    <int>          <int>     <dbl> <chr>    <int>
##  1            515         2      830            819        11 UA        1545
##  2            529         4      850            830        20 UA        1714
##  3            540         2      923            850        33 AA        1141
##  4            545        -1     1004           1022       -18 B6         725
##  5            600        -6      812            837       -25 DL         461
##  6            558        -4      740            728        12 UA        1696
##  7            600        -5      913            854        19 B6         507
##  8            600        -3      709            723       -14 EV        5708
##  9            600        -3      838            846        -8 B6          79
## 10            600        -2      753            745         8 AA         301
## # ℹ 336,766 more rows
## # ℹ 12 more variables: tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, year <int>,
## #   month <int>, day <int>, dep_time <int>
# Move every column whose name starts with "arr" to just before dep_time.
relocate(flights, starts_with("arr"), .before = dep_time)
## # A tibble: 336,776 × 19
##     year month   day arr_time arr_delay dep_time sched_dep_time dep_delay
##    <int> <int> <int>    <int>     <dbl>    <int>          <int>     <dbl>
##  1  2013     1     1      830        11      517            515         2
##  2  2013     1     1      850        20      533            529         4
##  3  2013     1     1      923        33      542            540         2
##  4  2013     1     1     1004       -18      544            545        -1
##  5  2013     1     1      812       -25      554            600        -6
##  6  2013     1     1      740        12      554            558        -4
##  7  2013     1     1      913        19      555            600        -5
##  8  2013     1     1      709       -14      557            600        -3
##  9  2013     1     1      838        -8      557            600        -3
## 10  2013     1     1      753         8      558            600        -2
## # ℹ 336,766 more rows
## # ℹ 11 more variables: sched_arr_time <int>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# dep_delay is the difference between dep_time and sched_dep_time.
select(flights, dep_time, sched_dep_time, dep_delay)
## # A tibble: 336,776 × 3
##    dep_time sched_dep_time dep_delay
##       <int>          <int>     <dbl>
##  1      517            515         2
##  2      533            529         4
##  3      542            540         2
##  4      544            545        -1
##  5      554            600        -6
##  6      554            558        -4
##  7      555            600        -5
##  8      557            600        -3
##  9      557            600        -3
## 10      558            600        -2
## # ℹ 336,766 more rows
# The same three columns picked as a contiguous range.
select(flights, dep_time:dep_delay)
## # A tibble: 336,776 × 3
##    dep_time sched_dep_time dep_delay
##       <int>          <int>     <dbl>
##  1      517            515         2
##  2      533            529         4
##  3      542            540         2
##  4      544            545        -1
##  5      554            600        -6
##  6      554            558        -4
##  7      555            600        -5
##  8      557            600        -3
##  9      557            600        -3
## 10      558            600        -2
## # ℹ 336,766 more rows
# Columns whose name starts with "dep" or with "arr".
select(flights, starts_with("dep") | starts_with("arr"))
## # A tibble: 336,776 × 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ℹ 336,766 more rows
# Naming the same column several times in select() still yields it once.
select(flights, month, month, month)
## # A tibble: 336,776 × 1
##    month
##    <int>
##  1     1
##  2     1
##  3     1
##  4     1
##  5     1
##  6     1
##  7     1
##  8     1
##  9     1
## 10     1
## # ℹ 336,766 more rows
variables <- c("year", "month", "day", "dep_delay", "arr_delay")

# any_of() selects the columns named in a character vector.
select(flights, any_of(variables))
## # A tibble: 336,776 × 5
##     year month   day dep_delay arr_delay
##    <int> <int> <int>     <dbl>     <dbl>
##  1  2013     1     1         2        11
##  2  2013     1     1         4        20
##  3  2013     1     1         2        33
##  4  2013     1     1        -1       -18
##  5  2013     1     1        -6       -25
##  6  2013     1     1        -4        12
##  7  2013     1     1        -5        19
##  8  2013     1     1        -3       -14
##  9  2013     1     1        -3        -8
## 10  2013     1     1        -2         8
## # ℹ 336,766 more rows
# contains() ignores case by default, so "TIME" matches dep_time, air_time,
# time_hour, ...; the default is spelled out here to make that visible.
flights |>
  select(contains("TIME", ignore.case = TRUE))
## # A tibble: 336,776 × 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
##       <int>          <int>    <int>          <int>    <dbl> <dttm>             
##  1      517            515      830            819      227 2013-01-01 05:00:00
##  2      533            529      850            830      227 2013-01-01 05:00:00
##  3      542            540      923            850      160 2013-01-01 05:00:00
##  4      544            545     1004           1022      183 2013-01-01 05:00:00
##  5      554            600      812            837      116 2013-01-01 06:00:00
##  6      554            558      740            728      150 2013-01-01 05:00:00
##  7      555            600      913            854      158 2013-01-01 06:00:00
##  8      557            600      709            723       53 2013-01-01 06:00:00
##  9      557            600      838            846      140 2013-01-01 06:00:00
## 10      558            600      753            745      138 2013-01-01 06:00:00
## # ℹ 336,766 more rows
# Move air_time to the first position and rename it to air_time_min.
flights |>
  relocate(air_time, .before = 1) |>
  rename(air_time_min = air_time)
## # A tibble: 336,776 × 19
##    air_time_min  year month   day dep_time sched_dep_time dep_delay arr_time
##           <dbl> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1          227  2013     1     1      517            515         2      830
##  2          227  2013     1     1      533            529         4      850
##  3          160  2013     1     1      542            540         2      923
##  4          183  2013     1     1      544            545        -1     1004
##  5          116  2013     1     1      554            600        -6      812
##  6          150  2013     1     1      554            558        -4      740
##  7          158  2013     1     1      555            600        -5      913
##  8           53  2013     1     1      557            600        -3      709
##  9          140  2013     1     1      557            600        -3      838
## 10          138  2013     1     1      558            600        -2      753
## # ℹ 336,766 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep only the tailnum column.
select(flights, tailnum)
## # A tibble: 336,776 × 1
##    tailnum
##    <chr>  
##  1 N14228 
##  2 N24211 
##  3 N619AA 
##  4 N804JB 
##  5 N668DN 
##  6 N39463 
##  7 N516JB 
##  8 N829AS 
##  9 N593JB 
## 10 N3ALAA 
## # ℹ 336,766 more rows
# Appending `|> arrange(arr_delay)` to the call above would fail: select(tailnum)
# kept only tailnum, so arr_delay no longer exists in the result.
# Flights to IAH with a computed speed column, fastest first. Only the date
# columns, dep_time, carrier, flight and speed are kept in the result.
flights |>
  filter(dest == "IAH") |>
  mutate(speed = distance / air_time * 60) |>
  arrange(desc(speed)) |>
  select(year:day, dep_time, carrier, flight, speed)
## # A tibble: 7,198 × 7
##     year month   day dep_time carrier flight speed
##    <int> <int> <int>    <int> <chr>    <int> <dbl>
##  1  2013     7     9      707 UA         226  522.
##  2  2013     8    27     1850 UA        1128  521.
##  3  2013     8    28      902 UA        1711  519.
##  4  2013     8    28     2122 UA        1022  519.
##  5  2013     6    11     1628 UA        1178  515.
##  6  2013     8    27     1017 UA         333  515.
##  7  2013     8    27     1205 UA        1421  515.
##  8  2013     8    27     1758 UA         302  515.
##  9  2013     9    27      521 UA         252  515.
## 10  2013     8    28      625 UA         559  515.
## # ℹ 7,188 more rows
# Group by month. The data itself is unchanged; the printout gains a
# "Groups: month [12]" header and later verbs will work per month.
group_by(flights, month)
## # A tibble: 336,776 × 19
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# 3.5.2 summarize()
# Mean dep_delay per month. Uh oh, missing data: without na.rm = TRUE a
# single NA in a group makes that group's mean NA -- here, every month.
flights |>
  group_by(month) |>
  summarize(avg_delay = mean(dep_delay))
## # A tibble: 12 × 2
##    month avg_delay
##    <int>     <dbl>
##  1     1        NA
##  2     2        NA
##  3     3        NA
##  4     4        NA
##  5     5        NA
##  6     6        NA
##  7     7        NA
##  8     8        NA
##  9     9        NA
## 10    10        NA
## 11    11        NA
## 12    12        NA
flights |>
  group_by(month) |>
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE)
  )
## # A tibble: 12 × 2
##    month avg_delay
##    <int>     <dbl>
##  1     1     10.0 
##  2     2     10.8 
##  3     3     13.2 
##  4     4     13.9 
##  5     5     13.0 
##  6     6     20.8 
##  7     7     21.7 
##  8     8     12.6 
##  9     9      6.72
## 10    10      6.24
## 11    11      5.44
## 12    12     16.6
flights |>
  group_by(month) |>
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE),
    n = n() # number of rows in each group
  )
## # A tibble: 12 × 3
##    month avg_delay     n
##    <int>     <dbl> <int>
##  1     1     10.0  27004
##  2     2     10.8  24951
##  3     3     13.2  28834
##  4     4     13.9  28330
##  5     5     13.0  28796
##  6     6     20.8  28243
##  7     7     21.7  29425
##  8     8     12.6  29327
##  9     9      6.72 27574
## 10    10      6.24 28889
## 11    11      5.44 27268
## 12    12     16.6  28135
# 3.5.3 The slice_ functions

# df |> slice_head(n = 1) takes the first row from each group.
# df |> slice_tail(n = 1) takes the last row in each group.
# df |> slice_min(x, n = 1) takes the row with the smallest value of column x.
# df |> slice_max(x, n = 1) takes the row with the largest value of column x.
# df |> slice_sample(n = 1) takes one random row.

# For each destination, keep the row(s) with the largest arr_delay.
# Note that there are 105 destinations but we get 108 rows here. What's up?
# slice_min() and slice_max() keep tied values, so n = 1 means "give us all
# rows with the highest value". If you want exactly one row per group, set
# with_ties = FALSE.
flights |>
  group_by(dest) |>
  slice_max(arr_delay, n = 1) |> # row(s) with the largest arr_delay per dest
  relocate(dest) # move dest to the front so it is visible in the output
## # A tibble: 108 × 19
## # Groups:   dest [105]
##    dest   year month   day dep_time sched_dep_time dep_delay arr_time
##    <chr> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1 ABQ    2013     7    22     2145           2007        98      132
##  2 ACK    2013     7    23     1139            800       219     1250
##  3 ALB    2013     1    25      123           2000       323      229
##  4 ANC    2013     8    17     1740           1625        75     2042
##  5 ATL    2013     7    22     2257            759       898      121
##  6 AUS    2013     7    10     2056           1505       351     2347
##  7 AVL    2013     8    13     1156            832       204     1417
##  8 BDL    2013     2    21     1728           1316       252     1839
##  9 BGR    2013    12     1     1504           1056       248     1628
## 10 BHM    2013     4    10       25           1900       325      136
## # ℹ 98 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Largest arr_delay per destination. with_ties = FALSE drops ties, so each
# destination contributes exactly one row (105 rows for 105 groups).
flights |>
  group_by(dest) |>
  slice_max(arr_delay, n = 1, with_ties = FALSE) |>
  relocate(dest)
## # A tibble: 105 × 19
## # Groups:   dest [105]
##    dest   year month   day dep_time sched_dep_time dep_delay arr_time
##    <chr> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1 ABQ    2013     7    22     2145           2007        98      132
##  2 ACK    2013     7    23     1139            800       219     1250
##  3 ALB    2013     1    25      123           2000       323      229
##  4 ANC    2013     8    17     1740           1625        75     2042
##  5 ATL    2013     7    22     2257            759       898      121
##  6 AUS    2013     7    10     2056           1505       351     2347
##  7 AVL    2013     8    13     1156            832       204     1417
##  8 BDL    2013     2    21     1728           1316       252     1839
##  9 BGR    2013    12     1     1504           1056       248     1628
## 10 BHM    2013     4    10       25           1900       325      136
## # ℹ 95 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# This is similar to computing the max delay with summarize(), but you get the whole corresponding row (or rows if there’s a tie) instead of the single summary statistic.

daily <- flights |>
  group_by(year, month, day) # multiple group variables, each summary peels off the last group.
daily
## # A tibble: 336,776 × 19
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
daily_flights <- daily |>
  summarize(n = n()) # explains how one group was peeled off (day), now only 2 left
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
# Count flights per day. Stating .groups explicitly silences the message
# that summarize() prints when it peels off the last grouping variable.
daily_flights <- daily |>
  summarize(
    n = n(),
    .groups = "drop_last" # suppress the message about peeling off groups
  )

# Summarize ungrouped data: after ungroup() the whole table is a single group.
daily |>
  ungroup() |>
  summarize( # only one row because ungrouped data is treated as one group
    avg_delay = mean(dep_delay, na.rm = TRUE),
    flights = n()
  )
## # A tibble: 1 × 2
##   avg_delay flights
##       <dbl>   <int>
## 1      12.6  336776
# 3.5.6 .by

flights |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .by = month # new and experimental
  )
## # A tibble: 12 × 3
##    month delay     n
##    <int> <dbl> <int>
##  1     1 10.0  27004
##  2    10  6.24 28889
##  3    11  5.44 27268
##  4    12 16.6  28135
##  5     2 10.8  24951
##  6     3 13.2  28834
##  7     4 13.9  28330
##  8     5 13.0  28796
##  9     6 20.8  28243
## 10     7 21.7  29425
## 11     8 12.6  29327
## 12     9  6.72 27574
# Group by multiple variables.
# .by works with all verbs and has the advantage that you don't need to use
# the .groups argument to suppress the grouping message or ungroup() when
# you're done.
flights |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .by = c(origin, dest) # per-call grouping; the result is not grouped
  )
## # A tibble: 224 × 4
##    origin dest  delay     n
##    <chr>  <chr> <dbl> <int>
##  1 EWR    IAH   11.8   3973
##  2 LGA    IAH    9.06  2951
##  3 JFK    MIA    9.34  3314
##  4 JFK    BQN    6.67   599
##  5 LGA    ATL   11.4  10263
##  6 EWR    ORD   14.6   6100
##  7 EWR    FLL   13.5   3793
##  8 LGA    IAD   16.7   1803
##  9 JFK    MCO   10.6   5464
## 10 LGA    ORD   10.7   8857
## # ℹ 214 more rows
# The summary is left unnamed, so the count column is labelled `n()`;
# summarize() also keeps the result grouped by carrier and says so.
flights |>
  group_by(carrier, dest) |>
  summarize(n())
## `summarise()` has grouped output by 'carrier'. You can override using the
## `.groups` argument.
## # A tibble: 314 × 3
## # Groups:   carrier [16]
##    carrier dest  `n()`
##    <chr>   <chr> <int>
##  1 9E      ATL      59
##  2 9E      AUS       2
##  3 9E      AVL      10
##  4 9E      BGR       1
##  5 9E      BNA     474
##  6 9E      BOS     914
##  7 9E      BTV       2
##  8 9E      BUF     833
##  9 9E      BWI     856
## 10 9E      CAE       3
## # ℹ 304 more rows
# Average departure delay per carrier, worst first.
flights |>
  group_by(carrier) |>
  summarize(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE)
  ) |>
  arrange(desc(avg_dep_delay)) # F9 (Frontier Airlines) has the worst delays
## # A tibble: 16 × 2
##    carrier avg_dep_delay
##    <chr>           <dbl>
##  1 F9              20.2 
##  2 EV              20.0 
##  3 YV              19.0 
##  4 FL              18.7 
##  5 WN              17.7 
##  6 9E              16.7 
##  7 B6              13.0 
##  8 VX              12.9 
##  9 OO              12.6 
## 10 UA              12.1 
## 11 MQ              10.6 
## 12 DL               9.26
## 13 AA               8.59
## 14 AS               5.80
## 15 HA               4.90
## 16 US               3.78
# Most-delayed departure for each destination. n = 1 is the default but is
# spelled out here; with_ties = FALSE guarantees one row per destination.
flights |>
  group_by(dest) |>
  slice_max(dep_delay, n = 1, with_ties = FALSE) |>
  relocate(dest, dep_delay)
## # A tibble: 105 × 19
## # Groups:   dest [105]
##    dest  dep_delay  year month   day dep_time sched_dep_time arr_time
##    <chr>     <dbl> <int> <int> <int>    <int>          <int>    <int>
##  1 ABQ         142  2013    12    14     2223           2001      133
##  2 ACK         219  2013     7    23     1139            800     1250
##  3 ALB         323  2013     1    25      123           2000      229
##  4 ANC          75  2013     8    17     1740           1625     2042
##  5 ATL         898  2013     7    22     2257            759      121
##  6 AUS         351  2013     7    10     2056           1505     2347
##  7 AVL         222  2013     6    14     1158            816     1335
##  8 BDL         252  2013     2    21     1728           1316     1839
##  9 BGR         248  2013    12     1     1504           1056     1628
## 10 BHM         325  2013     4    10       25           1900      136
## # ℹ 95 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Average departure delay for each value of hour, ignoring missing delays.
df_dep_time <- flights |>
  group_by(hour) |>
  summarize(
    avg_delay_time = mean(dep_delay, na.rm = TRUE)
  )

ggplot(df_dep_time, aes(x = hour, y = avg_delay_time)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).

# flights |>
#   group_by(dest) |>
#   slice_min(dep_delay, n = -1)

# df %>% count(a, b) is roughly equivalent to
# df %>% group_by(a, b) %>% summarise(n = n()).
# sort = TRUE puts the largest groups first.
flights |>
  count(carrier, sort = TRUE)
## # A tibble: 16 × 2
##    carrier     n
##    <chr>   <int>
##  1 UA      58665
##  2 B6      54635
##  3 EV      54173
##  4 DL      48110
##  5 AA      32729
##  6 MQ      26397
##  7 US      20536
##  8 9E      18460
##  9 WN      12275
## 10 VX       5162
## 11 FL       3260
## 12 AS        714
## 13 F9        685
## 14 YV        601
## 15 HA        342
## 16 OO         32
df <- tibble(
  x = 1:5,
  y = c("a", "b", "a", "a", "b"),
  z = c("K", "K", "L", "L", "K")
)

df
## # A tibble: 5 × 3
##       x y     z    
##   <int> <chr> <chr>
## 1     1 a     K    
## 2     2 b     K    
## 3     3 a     L    
## 4     4 a     L    
## 5     5 b     K
# a, a, a, a, b

df |>
  group_by(y) # groups by y, though the printed rows remain the same
## # A tibble: 5 × 3
## # Groups:   y [2]
##       x y     z    
##   <int> <chr> <chr>
## 1     1 a     K    
## 2     2 b     K    
## 3     3 a     L    
## 4     4 a     L    
## 5     5 b     K
# a, a, a, a, b

df |>
  arrange(y)
## # A tibble: 5 × 3
##       x y     z    
##   <int> <chr> <chr>
## 1     1 a     K    
## 2     3 a     L    
## 3     4 a     L    
## 4     2 b     K    
## 5     5 b     K
# two rows, mean of all a's, mean of all b's, two columns

df |>
  group_by(y) |>
  summarize(mean_x = mean(x))
## # A tibble: 2 × 2
##   y     mean_x
##   <chr>  <dbl>
## 1 a       2.67
## 2 b       3.5
# group by all combinations of y and z, so a K, a L, b K
df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x))
## `summarise()` has grouped output by 'y'. You can override using the `.groups`
## argument.
## # A tibble: 3 × 3
## # Groups:   y [2]
##   y     z     mean_x
##   <chr> <chr>  <dbl>
## 1 a     K        1  
## 2 a     L        3.5
## 3 b     K        3.5
# same?
df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x), .groups = "drop") # result is not grouped
## # A tibble: 3 × 3
##   y     z     mean_x
##   <chr> <chr>  <dbl>
## 1 a     K        1  
## 2 a     L        3.5
## 3 b     K        3.5
df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x)) # shows y, z and the summary mean_x
## `summarise()` has grouped output by 'y'. You can override using the `.groups`
## argument.
## # A tibble: 3 × 3
## # Groups:   y [2]
##   y     z     mean_x
##   <chr> <chr>  <dbl>
## 1 a     K        1  
## 2 a     L        3.5
## 3 b     K        3.5
df |>
  group_by(y, z) |>
  mutate(mean_x = mean(x)) # create new column plus all others
## # A tibble: 5 × 4
## # Groups:   y, z [3]
##       x y     z     mean_x
##   <int> <chr> <chr>  <dbl>
## 1     1 a     K        1  
## 2     2 b     K        3.5
## 3     3 a     L        3.5
## 4     4 a     L        3.5
## 5     5 b     K        3.5
batters <- Lahman::Batting |>
  group_by(playerID) |>
  summarize(
    performance = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    n = sum(AB, na.rm = TRUE)
  )
batters
## # A tibble: 20,469 × 3
##    playerID  performance     n
##    <chr>           <dbl> <int>
##  1 aardsda01      0          4
##  2 aaronha01      0.305  12364
##  3 aaronto01      0.229    944
##  4 aasedo01       0          5
##  5 abadan01       0.0952    21
##  6 abadfe01       0.111      9
##  7 abadijo01      0.224     49
##  8 abbated01      0.254   3044
##  9 abbeybe01      0.169    225
## 10 abbeych01      0.281   1756
## # ℹ 20,459 more rows
batters |>
  filter(n > 100) |>
  ggplot(aes(x = n, y = performance)) +
  geom_point(alpha = 1 / 10) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

batters |>
  arrange(desc(performance))
## # A tibble: 20,469 × 3
##    playerID  performance     n
##    <chr>           <dbl> <int>
##  1 abramge01           1     1
##  2 alberan01           1     1
##  3 banisje01           1     1
##  4 bartocl01           1     1
##  5 bassdo01            1     1
##  6 birasst01           1     2
##  7 bruneju01           1     1
##  8 burnscb01           1     1
##  9 cammaer01           1     1
## 10 campsh01            1     1
## # ℹ 20,459 more rows

Ch 4

library(nycflights13)
flights %>%
  filter(dest == "IAH") %>%
  group_by(year, month, day) %>%
  summarize(
    n = n(),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(n > 10)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups:   year, month [12]
##     year month   day     n delay
##    <int> <int> <int> <int> <dbl>
##  1  2013     1     1    20 17.8 
##  2  2013     1     2    20  7   
##  3  2013     1     3    19 18.3 
##  4  2013     1     4    20 -3.2 
##  5  2013     1     5    13 20.2 
##  6  2013     1     6    18  9.28
##  7  2013     1     7    19 -7.74
##  8  2013     1     8    19  7.79
##  9  2013     1     9    19 18.1 
## 10  2013     1    10    19  6.68
## # ℹ 355 more rows
flights %>%
  filter(
    carrier == "UA",
    dest %in% c("IAH", "HOU"),
    sched_dep_time > 900,
    sched_arr_time < 2000
  ) %>%
  group_by(flight) %>%
  summarize(
    delay = mean(arr_delay, na.rm = TRUE),
    cancelled = sum(is.na(arr_delay)),
    n = n()
  ) %>%
  filter(n > 10)
## # A tibble: 74 × 4
##    flight delay cancelled     n
##     <int> <dbl>     <int> <int>
##  1     53 12.5          2    18
##  2    112 14.1          0    14
##  3    205 -1.71         0    14
##  4    235 -5.36         0    14
##  5    255 -9.47         0    15
##  6    268 38.6          1    15
##  7    292  6.57         0    21
##  8    318 10.7          1    20
##  9    337 20.1          2    21
## 10    370 17.5          0    11
## # ℹ 64 more rows

Ch 5

billboard
## # A tibble: 317 × 79
##    artist     track date.entered   wk1   wk2   wk3   wk4   wk5   wk6   wk7   wk8
##    <chr>      <chr> <date>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 2 Pac      Baby… 2000-02-26      87    82    72    77    87    94    99    NA
##  2 2Ge+her    The … 2000-09-02      91    87    92    NA    NA    NA    NA    NA
##  3 3 Doors D… Kryp… 2000-04-08      81    70    68    67    66    57    54    53
##  4 3 Doors D… Loser 2000-10-21      76    76    72    69    67    65    55    59
##  5 504 Boyz   Wobb… 2000-04-15      57    34    25    17    17    31    36    49
##  6 98^0       Give… 2000-08-19      51    39    34    26    26    19     2     2
##  7 A*Teens    Danc… 2000-07-08      97    97    96    95   100    NA    NA    NA
##  8 Aaliyah    I Do… 2000-01-29      84    62    51    41    38    35    35    38
##  9 Aaliyah    Try … 2000-03-18      59    53    38    28    21    18    16    14
## 10 Adams, Yo… Open… 2000-08-26      76    76    74    69    68    67    61    58
## # ℹ 307 more rows
## # ℹ 68 more variables: wk9 <dbl>, wk10 <dbl>, wk11 <dbl>, wk12 <dbl>,
## #   wk13 <dbl>, wk14 <dbl>, wk15 <dbl>, wk16 <dbl>, wk17 <dbl>, wk18 <dbl>,
## #   wk19 <dbl>, wk20 <dbl>, wk21 <dbl>, wk22 <dbl>, wk23 <dbl>, wk24 <dbl>,
## #   wk25 <dbl>, wk26 <dbl>, wk27 <dbl>, wk28 <dbl>, wk29 <dbl>, wk30 <dbl>,
## #   wk31 <dbl>, wk32 <dbl>, wk33 <dbl>, wk34 <dbl>, wk35 <dbl>, wk36 <dbl>,
## #   wk37 <dbl>, wk38 <dbl>, wk39 <dbl>, wk40 <dbl>, wk41 <dbl>, wk42 <dbl>, …
billboard |>
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    values_to = "rank"
  )
## # A tibble: 24,092 × 5
##    artist track                   date.entered week   rank
##    <chr>  <chr>                   <date>       <chr> <dbl>
##  1 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk1      87
##  2 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk2      82
##  3 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk3      72
##  4 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk4      77
##  5 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk5      87
##  6 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk6      94
##  7 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk7      99
##  8 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk8      NA
##  9 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk9      NA
## 10 2 Pac  Baby Don't Cry (Keep... 2000-02-26   wk10     NA
## # ℹ 24,082 more rows
billboard |>
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    values_to = "rank",
    values_drop_na = TRUE
  )
## # A tibble: 5,307 × 5
##    artist  track                   date.entered week   rank
##    <chr>   <chr>                   <date>       <chr> <dbl>
##  1 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk1      87
##  2 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk2      82
##  3 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk3      72
##  4 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk4      77
##  5 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk5      87
##  6 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk6      94
##  7 2 Pac   Baby Don't Cry (Keep... 2000-02-26   wk7      99
##  8 2Ge+her The Hardest Part Of ... 2000-09-02   wk1      91
##  9 2Ge+her The Hardest Part Of ... 2000-09-02   wk2      87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02   wk3      92
## # ℹ 5,297 more rows
billboard_longer <- billboard |>
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    values_to = "rank",
    values_drop_na = TRUE
  ) |>
  mutate(
    week = parse_number(week)
  )
billboard_longer
## # A tibble: 5,307 × 5
##    artist  track                   date.entered  week  rank
##    <chr>   <chr>                   <date>       <dbl> <dbl>
##  1 2 Pac   Baby Don't Cry (Keep... 2000-02-26       1    87
##  2 2 Pac   Baby Don't Cry (Keep... 2000-02-26       2    82
##  3 2 Pac   Baby Don't Cry (Keep... 2000-02-26       3    72
##  4 2 Pac   Baby Don't Cry (Keep... 2000-02-26       4    77
##  5 2 Pac   Baby Don't Cry (Keep... 2000-02-26       5    87
##  6 2 Pac   Baby Don't Cry (Keep... 2000-02-26       6    94
##  7 2 Pac   Baby Don't Cry (Keep... 2000-02-26       7    99
##  8 2Ge+her The Hardest Part Of ... 2000-09-02       1    91
##  9 2Ge+her The Hardest Part Of ... 2000-09-02       2    87
## 10 2Ge+her The Hardest Part Of ... 2000-09-02       3    92
## # ℹ 5,297 more rows
billboard_longer |>
  ggplot(aes(x = week, y = rank, group = track)) +
  geom_line(alpha = 0.25) +
  scale_y_reverse()

df <- tribble(
  ~id, ~bp1, ~bp2,
  "A", 100, 120,
  "B", 140, 115,
  "C", 120, 125
)

df |>
  pivot_longer(
    cols = bp1:bp2,
    names_to = "measurement",
    values_to = "value"
  )
## # A tibble: 6 × 3
##   id    measurement value
##   <chr> <chr>       <dbl>
## 1 A     bp1           100
## 2 A     bp2           120
## 3 B     bp1           140
## 4 B     bp2           115
## 5 C     bp1           120
## 6 C     bp2           125
?who2

who2 |>
  pivot_longer(
    cols = !(country:year),
    names_to = c("diagnosis", "gender", "age"),
    names_sep = "_",
    values_to = "count"
  )
## # A tibble: 405,440 × 6
##    country      year diagnosis gender age   count
##    <chr>       <dbl> <chr>     <chr>  <chr> <dbl>
##  1 Afghanistan  1980 sp        m      014      NA
##  2 Afghanistan  1980 sp        m      1524     NA
##  3 Afghanistan  1980 sp        m      2534     NA
##  4 Afghanistan  1980 sp        m      3544     NA
##  5 Afghanistan  1980 sp        m      4554     NA
##  6 Afghanistan  1980 sp        m      5564     NA
##  7 Afghanistan  1980 sp        m      65       NA
##  8 Afghanistan  1980 sp        f      014      NA
##  9 Afghanistan  1980 sp        f      1524     NA
## 10 Afghanistan  1980 sp        f      2534     NA
## # ℹ 405,430 more rows
who2 |>
  pivot_longer(
    cols = !(country:year),
    names_to = c("diagnosis", "gender", "age"),
    names_sep = "_",
    values_to = "count"
  )
## # A tibble: 405,440 × 6
##    country      year diagnosis gender age   count
##    <chr>       <dbl> <chr>     <chr>  <chr> <dbl>
##  1 Afghanistan  1980 sp        m      014      NA
##  2 Afghanistan  1980 sp        m      1524     NA
##  3 Afghanistan  1980 sp        m      2534     NA
##  4 Afghanistan  1980 sp        m      3544     NA
##  5 Afghanistan  1980 sp        m      4554     NA
##  6 Afghanistan  1980 sp        m      5564     NA
##  7 Afghanistan  1980 sp        m      65       NA
##  8 Afghanistan  1980 sp        f      014      NA
##  9 Afghanistan  1980 sp        f      1524     NA
## 10 Afghanistan  1980 sp        f      2534     NA
## # ℹ 405,430 more rows
df <- tribble(
  ~id, ~bp1, ~bp2,
  "A", 100, 120,
  "B", 140, 115,
  "C", 120, 125
)

df |> pivot_longer(
  cols = bp1:bp2,
  names_to = "measurement",
  values_to = "amount"
)
## # A tibble: 6 × 3
##   id    measurement amount
##   <chr> <chr>        <dbl>
## 1 A     bp1            100
## 2 A     bp2            120
## 3 B     bp1            140
## 4 B     bp2            115
## 5 C     bp1            120
## 6 C     bp2            125
household
## # A tibble: 5 × 5
##   family dob_child1 dob_child2 name_child1 name_child2
##    <int> <date>     <date>     <chr>       <chr>      
## 1      1 1998-11-26 2000-01-29 Susan       Jose       
## 2      2 1996-06-22 NA         Mark        <NA>       
## 3      3 2002-07-11 2004-04-05 Sam         Seth       
## 4      4 2004-10-10 2009-08-27 Craig       Khai       
## 5      5 2000-12-05 2005-02-28 Parker      Gracie
?household

# Column names look like <value>_<child>: the part before "_" (dob / name)
# becomes the output column via ".value", the part after goes into `child`.
# values_to is omitted on purpose: pivot_longer() ignores it whenever
# names_to contains ".value".
household |>
  pivot_longer(
    cols = !family,
    names_to = c(".value", "child"),
    names_sep = "_",
    values_drop_na = TRUE
  )
## # A tibble: 9 × 4
##   family child  dob        name  
##    <int> <chr>  <date>     <chr> 
## 1      1 child1 1998-11-26 Susan 
## 2      1 child2 2000-01-29 Jose  
## 3      2 child1 1996-06-22 Mark  
## 4      3 child1 2002-07-11 Sam   
## 5      3 child2 2004-04-05 Seth  
## 6      4 child1 2004-10-10 Craig 
## 7      4 child2 2009-08-27 Khai  
## 8      5 child1 2000-12-05 Parker
## 9      5 child2 2005-02-28 Gracie
cms_patient_experience |>
  distinct(measure_cd, measure_title)
## # A tibble: 6 × 2
##   measure_cd   measure_title                                                    
##   <chr>        <chr>                                                            
## 1 CAHPS_GRP_1  CAHPS for MIPS SSM: Getting Timely Care, Appointments, and Infor…
## 2 CAHPS_GRP_2  CAHPS for MIPS SSM: How Well Providers Communicate               
## 3 CAHPS_GRP_3  CAHPS for MIPS SSM: Patient's Rating of Provider                 
## 4 CAHPS_GRP_5  CAHPS for MIPS SSM: Health Promotion and Education               
## 5 CAHPS_GRP_8  CAHPS for MIPS SSM: Courteous and Helpful Office Staff           
## 6 CAHPS_GRP_12 CAHPS for MIPS SSM: Stewardship of Patient Resources
cms_patient_experience |>
  pivot_wider(
    id_cols = starts_with("org"),
    names_from = measure_cd,
    values_from = prf_rate
  )
## # A tibble: 95 × 8
##    org_pac_id org_nm CAHPS_GRP_1 CAHPS_GRP_2 CAHPS_GRP_3 CAHPS_GRP_5 CAHPS_GRP_8
##    <chr>      <chr>        <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
##  1 0446157747 USC C…          63          87          86          57          85
##  2 0446162697 ASSOC…          59          85          83          63          88
##  3 0547164295 BEAVE…          49          NA          75          44          73
##  4 0749333730 CAPE …          67          84          85          65          82
##  5 0840104360 ALLIA…          66          87          87          64          87
##  6 0840109864 REX H…          73          87          84          67          91
##  7 0840513552 SCL H…          58          83          76          58          78
##  8 0941545784 GRITM…          46          86          81          54          NA
##  9 1052612785 COMMU…          65          84          80          58          87
## 10 1254237779 OUR L…          61          NA          NA          65          NA
## # ℹ 85 more rows
## # ℹ 1 more variable: CAHPS_GRP_12 <dbl>
cms_patient_experience
## # A tibble: 500 × 5
##    org_pac_id org_nm                           measure_cd measure_title prf_rate
##    <chr>      <chr>                            <chr>      <chr>            <dbl>
##  1 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       63
##  2 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       87
##  3 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       86
##  4 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       57
##  5 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       85
##  6 0446157747 USC CARE MEDICAL GROUP INC       CAHPS_GRP… CAHPS for MI…       24
##  7 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       59
##  8 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       85
##  9 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       83
## 10 0446162697 ASSOCIATION OF UNIVERSITY PHYSI… CAHPS_GRP… CAHPS for MI…       63
## # ℹ 490 more rows
df <- tribble(
  ~id, ~measurement, ~value,
  "A",        "bp1",    100,
  "B",        "bp1",    140,
  "B",        "bp2",    115,
  "A",        "bp2",    120,
  "A",        "bp3",    105
)

# Spread the measurements into one column per bp reading; combinations that
# were never measured (B / bp3) are filled with NA.
df |> pivot_wider(
  id_cols = id, # optional in this case
  names_from = measurement,
  values_from = value
)
## # A tibble: 2 × 4
##   id      bp1   bp2   bp3
##   <chr> <dbl> <dbl> <dbl>
## 1 A       100   120   105
## 2 B       140   115    NA
df |>
  distinct(measurement) |>
  pull() # extract the column as a plain vector
## [1] "bp1" "bp2" "bp3"
df |>
  select(-measurement, -value) |> # all but measurement, value
  distinct() # A B
## # A tibble: 2 × 1
##   id   
##   <chr>
## 1 A    
## 2 B
df |>
  select(-measurement, -value) |>
  distinct() |>
  mutate(x = NA, y = NA, z = NA)
## # A tibble: 2 × 4
##   id    x     y     z    
##   <chr> <lgl> <lgl> <lgl>
## 1 A     NA    NA    NA   
## 2 B     NA    NA    NA
df <- tribble(
  ~id, ~measurement, ~value,
  "A",        "bp1",    100,
  "A",        "bp1",    102,
  "A",        "bp2",    120,
  "B",        "bp1",    140,
  "B",        "bp2",    115
)

df |>
  summarise(
    n = n(),
    .by = c(id, measurement)
  ) |>
  filter(n > 1)
## # A tibble: 1 × 3
##   id    measurement     n
##   <chr> <chr>       <int>
## 1 A     bp1             2
df |>
  # check to see if there are repeat combinations of ID and measurement and, if so, remove the repeat
  distinct(id, measurement, .keep_all = TRUE) |>
  pivot_wider(
    id_cols = id,
    names_from = measurement,
    values_from = value
  )
## # A tibble: 2 × 3
##   id      bp1   bp2
##   <chr> <dbl> <dbl>
## 1 A       100   120
## 2 B       140   115

Ch 6

library(dplyr)
library(nycflights13)

not_cancelled <- flights |>
  filter(!is.na(dep_delay), !is.na(arr_delay))

not_cancelled |>
  group_by(year, month, day) |>
  summarize(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.4 
##  2  2013     1     2 13.7 
##  3  2013     1     3 10.9 
##  4  2013     1     4  8.97
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.56
##  9  2013     1     9  2.30
## 10  2013     1    10  2.84
## # ℹ 355 more rows
getwd()
## [1] "/Users/danielstafford/Coding/Tutorials/r4ds"
# always use relative paths! 👀
# Edit a DF interactively!
library(DataEditR)

# mtcars_new <- data_edit(mtcars,
#   save_as = "mtcars_new.csv"
# )

library(ViewPipeSteps)
diamonds
## # A tibble: 53,940 × 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7  0.24 Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8  0.26 Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9  0.22 Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10  0.23 Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # ℹ 53,930 more rows
# View pipe steps!
# diamonds %>%
#   select(carat, cut, color, clarity, price) %>%
#   group_by(color) %>%
#   summarise(n = n(), price = mean(price)) %>%
#   arrange(desc(color)) %>%
#   print_pipe_steps()

Ch 7

students <- read_csv("https://pos.it/r4ds-students-csv")
## Rows: 6 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Full Name, favourite.food, mealPlan, AGE
## dbl (1): Student ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(students)
## Rows: 6
## Columns: 5
## $ `Student ID`   <dbl> 1, 2, 3, 4, 5, 6
## $ `Full Name`    <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite.food <chr> "Strawberry yoghurt", "French fries", "N/A", "Anchovies…
## $ mealPlan       <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun…
## $ AGE            <chr> "4", "5", "7", NA, "five", "6"
students <- read_csv("https://pos.it/r4ds-students-csv", na = c("N/A", "")) # capture both empty and N/A strings
## Rows: 6 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Full Name, favourite.food, mealPlan, AGE
## dbl (1): Student ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(students)
## Rows: 6
## Columns: 5
## $ `Student ID`   <dbl> 1, 2, 3, 4, 5, 6
## $ `Full Name`    <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite.food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", …
## $ mealPlan       <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun…
## $ AGE            <chr> "4", "5", "7", NA, "five", "6"
# column names containing spaces annoyingly need backticks, so rename them
students <- rename(
  students,
  student_id = `Student ID`,
  full_name = `Full Name`
)
glimpse(students)
## Rows: 6
## Columns: 5
## $ student_id     <dbl> 1, 2, 3, 4, 5, 6
## $ full_name      <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite.food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", …
## $ mealPlan       <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun…
## $ AGE            <chr> "4", "5", "7", NA, "five", "6"
# alternative automatic method
library(janitor)
students <- read_csv("https://pos.it/r4ds-students-csv", na = c("N/A", ""))
## Rows: 6 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Full Name, favourite.food, mealPlan, AGE
## dbl (1): Student ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(students)
## Rows: 6
## Columns: 5
## $ `Student ID`   <dbl> 1, 2, 3, 4, 5, 6
## $ `Full Name`    <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite.food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", …
## $ mealPlan       <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun…
## $ AGE            <chr> "4", "5", "7", NA, "five", "6"
students |> janitor::clean_names()
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan           age  
##        <dbl> <chr>            <chr>              <chr>               <chr>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
## 2          2 Barclay Lynn     French fries       Lunch only          5    
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch 7    
## 4          4 Leon Rossini     Anchovies          Lunch only          <NA> 
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
## 6          6 Güvenç Attila    Ice cream          Lunch only          6
# change categorical column to a factor, fix age
students <- students |>
  janitor::clean_names() |>
  mutate(
    # convert meal_plan from chr to fct
    meal_plan = factor(meal_plan),
    # if age is "five", turn it into "5", otherwise leave it alone; then
    # parse all age strings as numbers
    age = parse_number(if_else(age == "five", "5", age))
  )
glimpse(students)
## Rows: 6
## Columns: 5
## $ student_id     <dbl> 1, 2, 3, 4, 5, 6
## $ full_name      <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo…
## $ favourite_food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", …
## $ meal_plan      <fct> Lunch only, Lunch only, Breakfast and lunch, Lunch only…
## $ age            <dbl> 4, 5, 7, NA, 5, 6
read_csv(
  "a,b,c
  1,2,3
  4,5,6"
)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): a, b, c
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6
# goal: skip the first two (metadata) rows of the csv; without `skip` the
# first line becomes the header and the rest is parsed as a single column
read_csv(
  "The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3"
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 3 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): The first line of metadata
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 3 × 1
##   `The first line of metadata`
##   <chr>                       
## 1 The second line of metadata 
## 2 x,y,z                       
## 3 1,2,3
read_csv(
  "The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3",
  skip = 2
)
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
# ignore specific lines
read_csv(
  "/ A comment I want to skip
  x,y,z
  1,2,3",
  comment = "/"
)
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
# no col names
read_csv(
  "1,2,3
  4,5,6",
  # treat the first row as data; columns are auto-named X1, X2, X3
  col_names = FALSE
)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): X1, X2, X3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
##      X1    X2    X3
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6
# custom col names
read_csv(
  "1,2,3
  4,5,6",
  col_names = c("x", "y", "z")
)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6
# read_csv2(): semicolon-separated, read_tsv(): tab-separated, read_delim(): any delimiter (guessed if not given), read_fwf(): fixed width, read_table(): white space/fixed width, read_log(): apache log
# delim
# na, trim_ws, etc.
#  read_fwf() fwf_empty() - Guesses based on the positions of empty columns. fwf_widths() - Supply the widths of the columns. fwf_positions() - Supply paired vectors of start and end positions. fwf_cols() - Supply named arguments of paired start and end positions or column widths.

read_csv("x,y\n1,'a,b'", quote = "'")
## Rows: 1 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): y
## dbl (1): x
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b
read_csv("a,b,c\n1,2,3\n4,5,6") # was missing a column
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): a, b, c
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6
annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)

annoying
## # A tibble: 10 × 2
##      `1`   `2`
##    <int> <dbl>
##  1     1  2.05
##  2     2  4.02
##  3     3  6.20
##  4     4  8.19
##  5     5  9.84
##  6     6 13.0 
##  7     7 13.8 
##  8     8 16.9 
##  9     9 18.1 
## 10    10 18.5
getOne <- annoying |>
  select("1")

# Extracting the variable labeled as '1'
annoying |>
  pull(`1`)
##  [1]  1  2  3  4  5  6  7  8  9 10
# scatterplot one vs. two
annoying |>
  ggplot(aes(x = `2`, y = `1`)) +
  geom_point()

# Creating a new column called 3, which is 2 divided by 1.
annoying <- annoying |>
  mutate(
    `3` = `2` / `1`
  )

# Renaming the columns to one, two, and three
annoying |>
  rename("one" = `1`, "two" = `2`, "three" = `3`)
## # A tibble: 10 × 3
##      one   two three
##    <int> <dbl> <dbl>
##  1     1  2.05  2.05
##  2     2  4.02  2.01
##  3     3  6.20  2.07
##  4     4  8.19  2.05
##  5     5  9.84  1.97
##  6     6 13.0   2.16
##  7     7 13.8   1.98
##  8     8 16.9   2.11
##  9     9 18.1   2.02
## 10    10 18.5   1.85
read_csv("
  logical,numeric,date,string
  TRUE,1,2021-01-15,abc
  false,4.5,2021-02-15,def
  T,Inf,2021-02-16,ghi
")
## Rows: 3 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): string
## dbl  (1): numeric
## lgl  (1): logical
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 3 × 4
##   logical numeric date       string
##   <lgl>     <dbl> <date>     <chr> 
## 1 TRUE        1   2021-01-15 abc   
## 2 FALSE       4.5 2021-02-15 def   
## 3 TRUE      Inf   2021-02-16 ghi
another_csv <- "
x,y,z
1,2,3"

read_csv(
  another_csv,
  col_types = cols(.default = col_character())
)
## # A tibble: 1 × 3
##   x     y     z    
##   <chr> <chr> <chr>
## 1 1     2     3
sales_files <- c(
  "https://pos.it/r4ds-01-sales",
  "https://pos.it/r4ds-02-sales",
  "https://pos.it/r4ds-03-sales"
)
read_csv(sales_files, id = "file") # id argument adds a new column called file to the resulting data frame that identifies the file the data come from.
## Rows: 19 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): month
## dbl (4): year, brand, item, n
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 19 × 6
##    file                         month     year brand  item     n
##    <chr>                        <chr>    <dbl> <dbl> <dbl> <dbl>
##  1 https://pos.it/r4ds-01-sales January   2019     1  1234     3
##  2 https://pos.it/r4ds-01-sales January   2019     1  8721     9
##  3 https://pos.it/r4ds-01-sales January   2019     1  1822     2
##  4 https://pos.it/r4ds-01-sales January   2019     2  3333     1
##  5 https://pos.it/r4ds-01-sales January   2019     2  2156     9
##  6 https://pos.it/r4ds-01-sales January   2019     2  3987     6
##  7 https://pos.it/r4ds-01-sales January   2019     2  3827     6
##  8 https://pos.it/r4ds-02-sales February  2019     1  1234     8
##  9 https://pos.it/r4ds-02-sales February  2019     1  8721     2
## 10 https://pos.it/r4ds-02-sales February  2019     1  1822     3
## 11 https://pos.it/r4ds-02-sales February  2019     2  3333     1
## 12 https://pos.it/r4ds-02-sales February  2019     2  2156     3
## 13 https://pos.it/r4ds-02-sales February  2019     2  3987     6
## 14 https://pos.it/r4ds-03-sales March     2019     1  1234     3
## 15 https://pos.it/r4ds-03-sales March     2019     1  3627     1
## 16 https://pos.it/r4ds-03-sales March     2019     1  8820     3
## 17 https://pos.it/r4ds-03-sales March     2019     2  7253     1
## 18 https://pos.it/r4ds-03-sales March     2019     2  8766     3
## 19 https://pos.it/r4ds-03-sales March     2019     2  8288     6
sales_files <- list.files("data", pattern = "sales\\.csv$", full.names = TRUE)
sales_files
## [1] "data/01-sales.csv" "data/02-sales.csv" "data/03-sales.csv"
students
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan             age
##        <dbl> <chr>            <chr>              <fct>               <dbl>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
## 2          2 Barclay Lynn     French fries       Lunch only              5
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7
## 4          4 Leon Rossini     Anchovies          Lunch only             NA
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
## 6          6 Güvenç Attila    Ice cream          Lunch only              6
write_csv(students, "data/students-2.csv")
read_csv("data/students-2.csv") # note that we lose col type for meal_plan
## Rows: 6 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): full_name, favourite_food, meal_plan
## dbl (2): student_id, age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan             age
##        <dbl> <chr>            <chr>              <chr>               <dbl>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
## 2          2 Barclay Lynn     French fries       Lunch only              5
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7
## 4          4 Leon Rossini     Anchovies          Lunch only             NA
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
## 6          6 Güvenç Attila    Ice cream          Lunch only              6
# RDS: R's custom binary format (keeps column types such as the meal_plan factor)
write_rds(students, "data/students.rds")
read_rds("data/students.rds")
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan             age
##        <dbl> <chr>            <chr>              <fct>               <dbl>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
## 2          2 Barclay Lynn     French fries       Lunch only              5
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7
## 4          4 Leon Rossini     Anchovies          Lunch only             NA
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
## 6          6 Güvenç Attila    Ice cream          Lunch only              6
# arrow binary for many languages
library(arrow)
## Warning: package 'arrow' was built under R version 4.2.3
## Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information.
## 
## Attaching package: 'arrow'
## 
## The following object is masked from 'package:lubridate':
## 
##     duration
## 
## The following object is masked from 'package:utils':
## 
##     timestamp
students
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan             age
##        <dbl> <chr>            <chr>              <fct>               <dbl>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
## 2          2 Barclay Lynn     French fries       Lunch only              5
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7
## 4          4 Leon Rossini     Anchovies          Lunch only             NA
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
## 6          6 Güvenç Attila    Ice cream          Lunch only              6
# write_parquet(students, "students.parquet")
# read_parquet("students.parquet")
# by column which is a bit weird
tibble(
  x = c(1, 2, 5),
  y = c("h", "m", "g"),
  z = c(0.08, 0.83, 0.60)
)
## # A tibble: 3 × 3
##       x y         z
##   <dbl> <chr> <dbl>
## 1     1 h      0.08
## 2     2 m      0.83
## 3     5 g      0.6
# by row which is easier
tribble(
  ~x, ~y, ~z,
  1, "h", 0.08,
  2, "m", 0.83,
  5, "g", 0.60
)
## # A tibble: 3 × 3
##       x y         z
##   <dbl> <chr> <dbl>
## 1     1 h      0.08
## 2     2 m      0.83
## 3     5 g      0.6

Ch 8

y <- 1:4
mean(y)
## [1] 2.5
dput(mtcars)
## structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
## 24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
## 30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
## 19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 
## 8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4), 
##     disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
##     167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
##     71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 
##     301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95, 
##     123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150, 
##     150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9, 
##     3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 
##     3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 
##     3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
##     ), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 
##     3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 
##     1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14, 
##     1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61, 
##     19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6, 
##     18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87, 
##     17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
##     ), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 
##     0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1, 
##     1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
##     0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3, 
##     3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 
##     3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4, 
##     2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 
##     2, 2, 4, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag", 
## "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", 
## "Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", 
## "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", 
## "Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", 
## "Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", 
## "Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", 
## "Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", 
## "Volvo 142E"), class = "data.frame")

Ch 9

mpg
## # A tibble: 234 × 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4           1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4           2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4           2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4           2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4           2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4           3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 quattro   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 quattro   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 quattro   2    2008     4 manu… 4        20    28 p     comp…
## # ℹ 224 more rows
mpg |>
  ggplot(aes(x = displ, y = hwy, color = class)) +
  geom_point()

mpg |>
  ggplot(aes(x = displ, y = hwy, size = class)) +
  geom_point()
## Warning: Using size for a discrete variable is not advised.

mpg |>
  ggplot(aes(x = displ, y = hwy, alpha = class)) +
  geom_point()
## Warning: Using alpha for a discrete variable is not advised.

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "blue")

# Create a scatterplot of hwy vs. displ where the points are pink filled in triangles.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "pink", shape = 17)

# Why did the following code not result in a plot with blue points?
ggplot(mpg) +
  # "blue" inside aes() is treated as a data value mapped to color, not as
  # the color itself; color should be set outside of aes() to get blue points
  geom_point(aes(x = displ, y = hwy, color = "blue"))

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), color = "blue")

# what does the stroke aes do ?

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), stroke = 1) # adjust size, thickness

mpg |> ggplot(aes(x = displ, y = hwy, color = displ < 5)) +
  geom_point() # true / false with diff colors

ggplot(mpg, aes(x = displ, y = hwy, shape = drv)) +
  geom_point()

ggplot(mpg, aes(x = displ, y = hwy, linetype = drv)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(aes(linetype = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Left
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Middle
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(group = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Right
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(color = drv), show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_point(
    data = mpg |> filter(class == "2seater"),
    color = "red"
  ) +
  geom_point(
    data = mpg |> filter(class == "2seater"),
    shape = "circle open", size = 3, color = "red"
  )

# Left
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(binwidth = 2)

# Middle
ggplot(mpg, aes(x = hwy)) +
  geom_density()

# Right
ggplot(mpg, aes(x = hwy)) +
  geom_boxplot()

library(ggridges)
## Warning: package 'ggridges' was built under R version 4.2.3
ggplot(mpg, aes(x = hwy, y = drv, fill = drv, color = drv)) +
  geom_density_ridges(alpha = 0.5, show.legend = FALSE)
## Picking joint bandwidth of 1.28

# What geom would you use to draw a line chart? A boxplot? A histogram? An area chart?
# line chart - geom_line()
# boxplot - geom_boxplot()
# histogram - geom_histogram()
# area chart - geom_area()

# Earlier in this chapter we used show.legend without explaining it:

# show.legend = FALSE suppresses the legend for the color mapping
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(color = drv), show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# se = FALSE additionally drops the confidence band around each smooth
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_smooth(aes(color = drv), show.legend = FALSE, se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# se = Display confidence interval around smooth? (TRUE by default, see level to control.)
# hides legend, more space

# Recreate the R code necessary to generate the following graphs. Note that wherever a categorical variable is used in the plot, it’s drv

# all points plus a single smooth line without its confidence band
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(stroke = 2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# one smooth line per drv (grouped, not colored), no confidence band
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(stroke = 2) +
  geom_smooth(se = FALSE, aes(group = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# points and smooth lines both colored by drv
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point(stroke = 2) +
  geom_smooth(se = FALSE, aes(color = drv), show.legend = TRUE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# points colored by drv, smooth lines distinguished by linetype
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(stroke = 2, aes(color = drv)) +
  geom_smooth(se = FALSE, show.legend = TRUE, aes(linetype = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# large white points drawn first sit underneath the colored points
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(shape = 16, size = 7, color = "white") +
  # stroke is a fixed value, so it is set outside aes() rather than mapped
  geom_point(aes(color = drv), stroke = 2)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~cyl)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl, scales = "free_y") # different scales in columns, helps visualize better

# What happens if you facet on a continuous variable?

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  # with a continuous variable you get one panel per unique value
  facet_wrap(~cty)

mpg |> count(cty)
## # A tibble: 21 × 2
##      cty     n
##    <int> <int>
##  1     9     5
##  2    11    20
##  3    12     8
##  4    13    21
##  5    14    19
##  6    15    24
##  7    16    19
##  8    17    16
##  9    18    26
## 10    19    20
## # ℹ 11 more rows
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

# What do the empty cells in the plot above with facet_grid(drv ~ cyl) mean? Run the following code. How do they relate to the resulting plot?

ggplot(mpg) +
  geom_point(aes(x = drv, y = cyl))

# certain combinations do not exist; for instance, there are no
# rear-wheel-drive cars with 4 cylinders, so this filter returns zero rows
mpg |>
  filter(drv == "r", cyl == 4)
## # A tibble: 0 × 11
## # ℹ 11 variables: manufacturer <chr>, model <chr>, displ <dbl>, year <int>,
## #   cyl <int>, trans <chr>, drv <chr>, cty <int>, hwy <int>, fl <chr>,
## #   class <chr>
# What plots does the following code make? What does . do?

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(drv ~ .)

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(. ~ cyl)

# Take the first faceted plot in this section:

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(~class, nrow = 2)

# What are the advantages to using faceting instead of the color aesthetic? What are the disadvantages? How might the balance change if you had a larger dataset?

# facets allow for different scales (free y) and allow more breathing space. Larger data sets might mean too many panels though.

# Read ?facet_wrap. What does nrow do? What does ncol do? What other options control the layout of the individual panels? Why doesn’t facet_grid() have nrow and ncol arguments?

?facet_wrap
# nrow, ncol     Number of rows and columns.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  # nrow = 4 gives a more vertical layout; facet_grid() has no nrow/ncol
  # because its rows and columns are already set by the faceting variables
  facet_wrap(~class, nrow = 4)

# Which of the following plots makes it easier to compare engine size (displ) across cars with different drive trains? What does this say about when to place a faceting variable across rows or columns?

ggplot(mpg, aes(x = displ)) +
  geom_histogram() +
  facet_grid(drv ~ .) # by rows (much better)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mpg, aes(x = displ)) +
  geom_histogram() +
  facet_grid(. ~ drv) # columns compete with x=displ
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Recreate the following plot using facet_wrap() instead of facet_grid(). How do the positions of the facet labels change?

ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(drv ~ .)

# same panels with facet_wrap(): nrow = 3 keeps one panel per row, and the
# facet labels move from the right-hand side to the top of each panel
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(~drv, nrow = 3)

ggplot(diamonds, aes(x = cut)) +
  geom_bar()

?geom_bar

diamonds |>
  count(cut) |>
  ggplot(aes(x = cut, y = n)) +
  geom_bar(stat = "identity")

ggplot(diamonds, aes(x = cut, y = after_stat(prop), group = 1)) +
  geom_bar()

ggplot(diamonds) +
  stat_summary(
    aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )

# What is the default geom associated with stat_summary()? How could you rewrite the previous plot to use that geom function instead of the stat function?

# Uses "geom = pointrange" by default

diamonds |>
  group_by(cut) |>
  summarize(
    lower = min(depth),
    upper = max(depth),
    midpoint = median(depth)
  ) |>
  ggplot(aes(x = cut, y = midpoint)) +
  geom_pointrange(aes(ymin = lower, ymax = upper))

# What does geom_col() do? How is it different from geom_bar()?

# geom_col() uses the y values in the data as bar heights, while geom_bar() uses counts of rows

ggplot(diamonds, aes(x = cut)) +
  geom_bar()

ggplot(diamonds, aes(x = cut, y = depth)) +
  geom_col()

# Most geoms and stats come in pairs that are almost always used in concert. Make a list of all the pairs. What do they have in common? (Hint: Read through the documentation.)

# geom                  stat
# geom_bar()              stat_count()
# geom_bin2d()          stat_bin_2d()
# geom_boxplot()        stat_boxplot()
# geom_contour_filled() stat_contour_filled()
# geom_contour()          stat_contour()
# geom_count()          stat_sum()
# geom_density_2d()     stat_density_2d()
# geom_density()          stat_density()
# geom_dotplot()          stat_bindot()
# geom_function()         stat_function()
# geom_sf()             stat_sf()
# geom_sf()             stat_sf()
# geom_smooth()         stat_smooth()
# geom_violin()         stat_ydensity()
# geom_hex()              stat_bin_hex()
# geom_qq_line()          stat_qq_line()
# geom_qq()             stat_qq()
# geom_quantile()         stat_quantile()

# What variables does stat_smooth() compute? What arguments control its behavior?
?stat_smooth
# predicted value, lower CI from mean, upper CI from mean, and SE

# In our proportion bar chart, we needed to set group = 1. Why? In other words, what is the problem with these two graphs?

# In the first pair of plots, we see that setting group = 1 results in the marginal proportions of cuts being plotted. In the second pair of plots, setting group = color results in the proportions of colors within each cut being plotted.
# one variable
ggplot(diamonds, aes(x = cut, y = after_stat(prop))) +
  geom_bar()

ggplot(diamonds, aes(x = cut, y = after_stat(prop), group = 1)) +
  geom_bar() # after_stat(prop) represents the proportion of each category. The group = 1 argument is used to ensure the proportion are grouped

# two variables
ggplot(diamonds, aes(x = cut, fill = color, y = after_stat(prop))) +
  geom_bar()

ggplot(diamonds, aes(x = cut, fill = color, y = after_stat(prop), group = color)) +
  geom_bar()

# Color
ggplot(mpg, aes(x = drv, color = drv)) +
  geom_bar()

# Fill
ggplot(mpg, aes(x = drv, fill = drv)) +
  geom_bar()

# fill with another class other than x
ggplot(mpg, aes(x = drv, fill = class)) +
  geom_bar()

# identity
ggplot(mpg, aes(x = drv, fill = class)) +
  geom_bar(alpha = 1 / 5, position = "identity") # create overlap, so use alpha for transparency, The identity position adjustment is more useful for 2d geoms, like points, where it is the default.

# transparent
ggplot(mpg, aes(x = drv, color = class)) +
  geom_bar(fill = NA, position = "identity") # completely transparent by setting fill = NA.

# fill
ggplot(mpg, aes(x = drv, fill = class)) +
  geom_bar(position = "fill") # "fill" works like stacking, but makes each set of stacked bars the same height. This makes it easier to compare proportions across groups.

# dodge
ggplot(mpg, aes(x = drv, fill = class)) +
  geom_bar(position = "dodge") # "dodge" places overlapping objects directly beside one another. This makes it easier to compare individual values.

# fixing overplotting (many points plotted on top of each other)
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(position = "jitter") # adds a small amount of random noise to each point. This spreads the points out because no two points are likely to receive the same amount of random noise.

# shorthand for jitter
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_jitter()

# What is the problem with the following plot? How could you improve it?

ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()

# jitter it to avoid overplotting
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_jitter()

# What, if anything, is the difference between the two plots? Why?

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(position = "identity")

# no difference: position = "identity" is the default for geom_point()

# What parameters to geom_jitter() control the amount of jittering?
?geom_jitter # width, height, defaults to .4. this means the jitter values will occupy 80% (twice value of .4) of the implied bins. Categorical data is aligned on the integers, so a width or height of 0.5 will spread the data so it's not possible to see the distinction between the categories.

ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_jitter(width = .8, height = .8)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_jitter()

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_count() # size by overlap

# What’s the default position adjustment for geom_boxplot()? Create a visualization of the mpg dataset that demonstrates it.
?geom_boxplot # position = "dodge2"

# Use a discrete x and a second discrete variable mapped to fill, so that
# several boxes share each x position and the position adjustment has
# something to act on. (A continuous x such as cty only produces a
# "Continuous x aesthetic" warning and a single box.)
ggplot(mpg, aes(x = drv, y = hwy, fill = class)) +
  geom_boxplot()

# Spelling out the default explicitly gives the same plot as above.
ggplot(mpg, aes(x = drv, y = hwy, fill = class)) +
  geom_boxplot(position = "dodge2")

nz <- map_data("nz")

ggplot(nz, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", color = "black")

ggplot(nz, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", color = "black") +
  coord_quickmap() # sets the aspect ratio correctly for geographic maps.

bar <- ggplot(data = diamonds) +
  geom_bar(
    mapping = aes(x = clarity, fill = clarity),
    show.legend = FALSE,
    width = 1
  ) +
  theme(aspect.ratio = 1)

bar

bar + coord_flip() # flipped

bar + coord_polar() # bar/Coxcomb

# Turn a stacked bar chart into a pie chart using coord_polar().
mpg |> ggplot(aes(x = "", fill = class)) + # single stacked bar
  geom_bar() +
  coord_polar(theta = "y") # pie chart: theta is the variable mapped to angle (x or y), default = "x"

# What’s the difference between coord_quickmap() and coord_map()?
?coord_quickmap # quickmap: approximate projection, good for countries near equator, map: more computation, as no straight lines

# What does the following plot tell you about the relationship between city and highway mpg? Why is coord_fixed() important? What does geom_abline() do?

ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_point() +
  geom_abline() + # diagonal reference line where highway = city mileage, shows hwy mileage always higher than city
  coord_fixed() # 1 unit on y axis is same length as 1 unit on x axis,

# ggplot(data = <DATA>) +
#   <GEOM_FUNCTION>(
#      mapping = aes(<MAPPINGS>),
#      stat = <STAT>,
#      position = <POSITION>
#   ) +
#   <COORDINATE_FUNCTION> +
#   <FACET_FUNCTION>

Ch. 10: Exploratory data analysis

ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

smaller <- diamonds |>
  filter(carat < 3)

ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.01)

glimpse(smaller)
## Rows: 53,900
## Columns: 10
## $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
## $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
## $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
## $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
## $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
## $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
## $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
## $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
## $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
## $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)

ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50)) #  zoom to small values of the y-axis with coord_cartesian(): coord_cartesian() also has an xlim() argument for when you need to zoom into the x-axis. ggplot2 also has xlim() and ylim() functions that work slightly differently: they throw away the data outside the limits.

# Diamonds whose y value lies outside the 3-20 range, smallest y first,
# keeping only the price and the three dimension columns.
unusual <- diamonds |>
  select(price, x, y, z) |>
  filter(!between(y, 3, 20)) |>
  arrange(y)
unusual
## # A tibble: 9 × 4
##   price     x     y     z
##   <int> <dbl> <dbl> <dbl>
## 1  5139  0      0    0   
## 2  6381  0      0    0   
## 3 12800  0      0    0   
## 4 15686  0      0    0   
## 5 18034  0      0    0   
## 6  2130  0      0    0   
## 7  2130  0      0    0   
## 8  2075  5.15  31.8  5.12
## 9 12210  8.09  58.9  8.06
# Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.

# Same binwidth as the y and z histograms below so the three are comparable.
ggplot(diamonds, aes(x = x)) +
  geom_histogram(binwidth = 0.5)

ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)

ggplot(diamonds, aes(x = z)) +
  geom_histogram(binwidth = 0.5)

summary(diamonds$x) # max 10.74
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.710   5.700   5.731   6.540  10.740
summary(diamonds$y) # max 58.9
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.720   5.710   5.735   6.540  58.900
summary(diamonds$z) # max 31.8
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.910   3.530   3.539   4.040  31.800
# x and y have almost identical distributions (medians 5.70 and 5.71) while z
# is smaller (median 3.53), so x and y are presumably length and width and z
# the depth - confirm against ?diamonds. The minimum of 0 in all three and
# the maxima of y (58.9) and z (31.8) are far from the quartiles and look
# like data-entry errors.

# Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)

ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 3)

# How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?

diamonds |>
  filter(carat == 0.99) |>
  count() # 23
## # A tibble: 1 × 1
##       n
##   <int>
## 1    23
diamonds |>
  filter(carat == 1) |>
  count() # 1558, far more than the 23 diamonds at 0.99 carat
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1558
# Compare and contrast coord_cartesian() vs. xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?

# Baseline: the full, unzoomed histogram that the two zoomed versions below
# are compared against.
ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 3)

ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 3) +
  xlim(0, 10000) +
  ylim(0, 100) # no plotting  beyond limits
## Warning: Removed 5222 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 153 rows containing missing values or values outside the scale range
## (`geom_bar()`).

ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 3) +
  coord_cartesian(xlim = c(0, 10000), ylim = c(0, 100)) # still plots data beyond limits

# drop the row with strange values, not recommended
diamonds2 <- diamonds |>
  filter(between(y, 3, 20))

# better to replace with missing values
diamonds2 <- diamonds |>
  mutate(y = if_else(y < 3 | y > 20, NA, y))

# ggplot2 will warn that the rows with missing values have been removed
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

# to suppress the warning
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

nycflights13::flights |>
  mutate(
    cancelled = is.na(dep_time), # NA = flight canceled, so plot!
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + (sched_min / 60)
  ) |>
  ggplot(aes(x = sched_dep_time)) +
  geom_freqpoly(aes(color = cancelled), binwidth = 1 / 4)

# However this plot isn’t great because there are many more non-cancelled flights than cancelled flights.
# What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference in how missing values are handled in histograms and bar charts?

# Histogram: the NA values of y cannot be placed in a bin, so stat_bin()
# drops them and warns.
diamonds2 |> ggplot(aes(x = y)) +
  geom_histogram(binwidth = 0.5)
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Bar chart: y is numeric here, so stat_count() drops the NAs with a warning
# as well. NA is only drawn as its own bar when x is a discrete variable.
diamonds2 |> ggplot(aes(x = y)) +
  geom_bar()
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_count()`).

# What does na.rm = TRUE do in mean() and sum()?
diamonds2 |> summarise(
  ymean = mean(y) # not remove, NA will result
)
## # A tibble: 1 × 1
##   ymean
##   <dbl>
## 1    NA
# na.rm = TRUE drops the NA values before averaging. Spell out TRUE: the
# shorthand T is an ordinary variable that can be reassigned.
diamonds2 |> summarise(
  ymean = mean(y, na.rm = TRUE) # now works
)
## # A tibble: 1 × 1
##   ymean
##   <dbl>
## 1  5.73
diamonds2 |> summarise(
  ymean = sum(y) # not removed, NA will result
)
## # A tibble: 1 × 1
##   ymean
##   <dbl>
## 1    NA
# na.rm = TRUE drops the NA values before summing. Spell out TRUE: the
# shorthand T is an ordinary variable that can be reassigned.
diamonds2 |> summarise(
  ymean = sum(y, na.rm = TRUE) # now works (column keeps the name ymean but holds the sum)
)
## # A tibble: 1 × 1
##     ymean
##     <dbl>
## 1 309230.
# Recreate the frequency plot of scheduled_dep_time colored by whether the flight was cancelled or not. Also facet by the cancelled variable. Experiment with different values of the scales variable in the faceting function to mitigate the effect of more non-cancelled flights than cancelled flights.

# sched_dep_time is stored as an HHMM integer, so convert it to decimal hours
# first; otherwise binwidth = 1 / 4 is not a quarter of an hour and the
# jumps from xx59 to the next full hour leave gaps in the distribution.
nycflights13::flights |>
  mutate(
    cancelled = is.na(dep_time), # NA departure time = cancelled flight
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + (sched_min / 60)
  ) |>
  ggplot(aes(x = sched_dep_time)) +
  geom_freqpoly(aes(color = cancelled), binwidth = 1 / 4) +
  facet_wrap(~cancelled, scales = "free_y") # each facet gets its own y range

ggplot(diamonds, aes(x = price)) +
  geom_freqpoly(aes(color = cut), binwidth = 500, linewidth = 0.75) # default appearance of geom_freqpoly() is not that useful here because the height, determined by the overall count, differs so much across cuts, making it hard to see the differences in the shapes of their distributions.

# we’ll display the density, which is the count standardized so that the area under each frequency polygon is one.  Note that we’re mapping the density to y, but since density is not a variable in the diamonds dataset, we need to first calculate it. We use the after_stat() function to do so.
ggplot(diamonds, aes(x = price, y = after_stat(density))) +
  geom_freqpoly(aes(color = cut), binwidth = 500, linewidth = 0.75)

# visually simpler plot for exploring this relationship is using side-by-side boxplots.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot()

ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot() # quite scattered

ggplot(mpg, aes(x = fct_reorder(class, hwy, median), y = hwy)) +
  geom_boxplot() # reorder class based on the median value of hwy:

ggplot(mpg, aes(x = fct_reorder(class, hwy, median), y = hwy)) +
  geom_boxplot() +
  coord_flip() # to help with long names

# Use what you’ve learned to improve the visualization of the departure times of cancelled vs. non-cancelled flights.
# Convert the HHMM integer in sched_dep_time to decimal hours so the bins are
# real time intervals (a quarter of an hour), then plot densities so the two
# groups are comparable despite their very different sizes.
nycflights13::flights |>
  mutate(
    cancelled = is.na(dep_time), # NA departure time = cancelled flight
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + (sched_min / 60)
  ) |>
  ggplot(aes(x = sched_dep_time, y = after_stat(density))) +
  geom_freqpoly(aes(color = cancelled), binwidth = 1 / 4)

# Based on EDA, what variable in the diamonds dataset appears to be most important for predicting the price of a diamond? How is that variable correlated with cut? Why does the combination of those two relationships lead to lower quality diamonds being more expensive?

# TO DO

# Instead of exchanging the x and y variables, add coord_flip() as a new layer to the vertical boxplot to create a horizontal one. How does this compare to exchanging the variables?

# no difference apparently
ggplot(mpg, aes(x = fct_reorder(class, hwy, median), y = hwy)) +
  geom_boxplot() +
  coord_flip()

ggplot(mpg, aes(y = fct_reorder(class, hwy, median), x = hwy)) +
  geom_boxplot()

# One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of “outlying values”. One approach to remedy this problem is the letter value plot. Install the lvplot package, and try using geom_lv() to display the distribution of price vs. cut. What do you learn? How do you interpret the plots?

diamonds |> ggplot(aes(x = cut, y = price)) +
  geom_lv()

# useful for larger datasets, many more quantiles.

# Create a visualization of diamond prices vs. a categorical variable from the diamonds dataset using geom_violin(), then a faceted geom_histogram(), then a colored geom_freqpoly(), and then a colored geom_density(). Compare and contrast the four plots. What are the pros and cons of each method of visualizing the distribution of a numerical variable based on the levels of a categorical variable?

# no overlaps but look the same after 5000 count,
diamonds |> ggplot(aes(x = color, y = price)) +
  geom_violin()

# no overlaps but quite small
# One histogram panel per color. Uses the native pipe and FALSE to match the
# rest of the file.
diamonds |>
  ggplot(aes(x = price)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~color)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# not too bad, but gets hard to read where the counts are similar
diamonds |> ggplot(aes(x = price, color = color)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# overlaps but easier to see differences
diamonds |> ggplot(aes(x = price, fill = color)) +
  geom_density()

# If you have a small dataset, it’s sometimes useful to use geom_jitter() to avoid overplotting to more easily see the relationship between a continuous and categorical variable. The ggbeeswarm package provides a number of methods similar to geom_jitter(). List them and briefly describe what each one does.

# visit https://github.com/eclarke/ggbeeswarm
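# geom_beeswarm - offsets points within each category so they sit next to each other instead of overlapping (a "beeswarm" plot).
# geom_quasirandom - similar to geom_jitter, but reduces overplotting by spacing the points with a van der Corput sequence or Tukey texturing.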
# count the number of observations for each combination of levels of these categorical variables. One way to do that is to rely on the built-in geom_count():
ggplot(diamonds, aes(x = cut, y = color)) +
  geom_count()

# alternative
diamonds |>
  count(color, cut) |>
  arrange(desc(n))
## # A tibble: 35 × 3
##    color cut           n
##    <ord> <ord>     <int>
##  1 G     Ideal      4884
##  2 E     Ideal      3903
##  3 F     Ideal      3826
##  4 H     Ideal      3115
##  5 G     Premium    2924
##  6 D     Ideal      2834
##  7 E     Very Good  2400
##  8 H     Premium    2360
##  9 E     Premium    2337
## 10 F     Premium    2331
## # ℹ 25 more rows
diamonds |>
  count(color, cut) |>
  ggplot(aes(x = color, y = cut)) +
  geom_tile(aes(fill = n))

# 10.5.2.1 Exercises
# How could you rescale the count dataset above to more clearly show the distribution of cut within color, or color within cut?

diamonds |>
  count(color, cut) |>
  group_by(color) |>
  mutate(percent_cut = n / sum(n))
## # A tibble: 35 × 4
## # Groups:   color [7]
##    color cut           n percent_cut
##    <ord> <ord>     <int>       <dbl>
##  1 D     Fair        163      0.0241
##  2 D     Good        662      0.0977
##  3 D     Very Good  1513      0.223 
##  4 D     Premium    1603      0.237 
##  5 D     Ideal      2834      0.418 
##  6 E     Fair        224      0.0229
##  7 E     Good        933      0.0952
##  8 E     Very Good  2400      0.245 
##  9 E     Premium    2337      0.239 
## 10 E     Ideal      3903      0.398 
## # ℹ 25 more rows
diamonds |>
  count(color, cut) |>
  group_by(cut) |> #
  mutate(percent_color = n / sum(n))
## # A tibble: 35 × 4
## # Groups:   cut [5]
##    color cut           n percent_color
##    <ord> <ord>     <int>         <dbl>
##  1 D     Fair        163         0.101
##  2 D     Good        662         0.135
##  3 D     Very Good  1513         0.125
##  4 D     Premium    1603         0.116
##  5 D     Ideal      2834         0.132
##  6 E     Fair        224         0.139
##  7 E     Good        933         0.190
##  8 E     Very Good  2400         0.199
##  9 E     Premium    2337         0.169
## 10 E     Ideal      3903         0.181
## # ℹ 25 more rows
# What different data insights do you get with a segmented bar chart if color is mapped to the x aesthetic and cut is mapped to the fill aesthetic? Calculate the counts that fall into each of the segments.

diamonds |> ggplot(aes(x = color, fill = cut)) +
  geom_bar()

# Count the diamonds in every color/cut combination, then draw the stacked
# bars from those precomputed counts and print each count in its segment.
diamonds_counts <- diamonds |>
  count(color, cut)

ggplot(diamonds_counts, aes(x = color, y = n, fill = cut)) +
  geom_col() + # bar heights come straight from n
  geom_text(aes(label = n), position = position_stack(vjust = 0.2))

# Use geom_tile() together with dplyr to explore how average flight departure delays vary by destination and month of year. What makes the plot difficult to read? How could you improve it?

# Mean departure delay for every (month, destination) pair, drawn as a heat
# map. .groups = "drop" returns an ungrouped tibble, so no extra
# group_by()/ungroup() step is needed and no grouping message is printed.
nycflights13::flights |>
  group_by(month, dest) |>
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") |>
  # Order destinations by their average monthly delay. na.rm = TRUE is
  # forwarded to mean() so a month whose mean is NaN (no recorded delays)
  # does not make the destination's ordering value missing.
  mutate(dest = reorder(dest, dep_delay, FUN = mean, na.rm = TRUE)) |>
  ggplot(aes(
    x = factor(month.name[month], levels = month.name), # months in calendar order
    y = dest,
    fill = dep_delay
  )) +
  geom_tile() +
  labs(x = "Month", y = "Destination", fill = "Departure Delay") +
  scale_fill_gradient(low = "white", high = "red") +
  theme(
    axis.text.y = element_text(size = 4), # many destinations: shrink labels
    axis.text.x = element_text(size = 7),
    legend.position = "bottom"
  )

ggplot(smaller, aes(x = carat, y = price)) +
  geom_point()

ggplot(smaller, aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 100) # using the alpha aesthetic to add transparency.

ggplot(smaller, aes(x = carat, y = price)) +
  geom_bin2d()

ggplot(smaller, aes(x = carat, y = price)) +
  geom_hex()

ggplot(smaller, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_width(carat, 0.1)))

# Instead of summarizing the conditional distribution with a boxplot, you could use a frequency polygon. What do you need to consider when using cut_width() vs. cut_number()? How does that impact a visualization of the 2d distribution of carat and price?

# Visualize the distribution of carat, partitioned by price.

# How does the price distribution of very large diamonds compare to small diamonds? Is it as you expect, or does it surprise you?

# Combine two of the techniques you’ve learned to visualize the combined distribution of cut, carat, and price.

# Two dimensional plots reveal outliers that are not visible in one dimensional plots. For example, some points in the following plot have an unusual combination of x and y values, which makes the points outliers even though their x and y values appear normal when examined separately. Why is a scatterplot a better display than a binned plot for this case?

diamonds |>
  filter(x >= 4) |>
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))

# Instead of creating boxes of equal width with cut_width(), we could create boxes that contain roughly equal number of points with cut_number(). What are the advantages and disadvantages of this approach?

ggplot(smaller, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_number(carat, 20)))

# Add log-transformed price and carat columns. Note that this overwrites
# `diamonds` for the rest of the session.
diamonds <- diamonds |>
  mutate(
    log_price = log(price),
    log_carat = log(carat)
  )

# Fit log(price) as a linear function of log(carat).
# NOTE(review): linear_reg()/fit()/augment() are presumably from
# parsnip/tidymodels, loaded outside this chunk - confirm.
diamonds_fit <- linear_reg() |>
  fit(log_price ~ log_carat, data = diamonds)

# Compute the residuals for every diamond, then exponentiate them so they are
# back on the price scale instead of the log scale.
diamonds_aug <- augment(diamonds_fit, new_data = diamonds) |>
  mutate(.resid = exp(.resid))

ggplot(diamonds_aug, aes(x = carat, y = .resid)) +
  geom_point()

ggplot(diamonds_aug, aes(x = cut, y = .resid)) +
  geom_boxplot()

Chapter 11

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(se = FALSE) +
  labs(
    x = "Engine displacement (L)",
    y = "Highway fuel economy (mpg)",
    color = "Car type",
    title = "Fuel efficiency generally decreases with engine size",
    subtitle = "Two seaters (sports cars) are an exception because of their light weight",
    caption = "Data from fueleconomy.gov"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

df <- tibble(
  x = 1:10,
  y = cumsum(x^2)
)

ggplot(df, aes(x, y)) +
  geom_point() +
  labs(
    x = quote(x[i]),
    y = quote(sum(x[i]^2, i == 1, n))
  )

# Create one plot on the fuel economy data with customized title, subtitle, caption, x, y, and color labels.
names(mpg)
##  [1] "manufacturer" "model"        "displ"        "year"         "cyl"         
##  [6] "trans"        "drv"          "cty"          "hwy"          "fl"          
## [11] "class"
?mpg
# City fuel economy against engine size, colored by fuel type, with every
# label customized. The color label and title were truncated ("Fue",
# "Disel tends") and the subtitle was copied from a plot colored by car class.
mpg |> ggplot(aes(x = displ, y = cty, color = fl)) +
  geom_point() +
  labs(
    x = "Engine displacement (L)",
    y = "City fuel economy (mpg)",
    color = "Fuel type",
    title = "City fuel economy generally decreases with engine size",
    subtitle = "Points are colored by fuel type",
    caption = "Data from fueleconomy.gov"
  )

Label exercises

# Highway vs. city mileage, points colored by car class, with one overall
# linear trend line. class is already discrete, so no factor() is needed.
mpg |> ggplot(aes(cty, hwy, color = class)) +
  geom_point() +
  # A constant color overrides the per-class color mapping, so a single line
  # is fitted instead of one per class.
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  labs(
    x = "MPG in the city",
    y = "MPG on the highway",
    color = "Car class",
    title = "Highway vs. city MPG",
    subtitle = "SUVs are terrible!"
  )
## `geom_smooth()` using formula = 'y ~ x'

Recreate the following plot using the fuel economy data. Note that both the colors and shapes of points vary by type of drive train.

mpg |> ggplot(aes(cty, hwy)) +
  geom_point(aes(color = factor(drv), shape = factor(drv))) +
  labs(
    x = "MPG in the city",
    y = "MPG on the highway",
    color = "Type of drive train",
    shape = "Type of drive train", # identical to avoid two legends
    title = "Highway vs. city MPG",
    subtitle = "SUVs are terrible!"
  )

11.3 Annotations!

label_info <- mpg |>
  group_by(drv) |>
  arrange(desc(displ)) |>
  slice_head(n = 1) |> # pull out the cars with the highest engine size in each drive type
  mutate(
    drive_type = case_when(
      drv == "f" ~ "front-wheel drive",
      drv == "r" ~ "rear-wheel drive",
      drv == "4" ~ "4-wheel drive"
    )
  ) |>
  select(displ, hwy, drv, drive_type)

label_info
## # A tibble: 3 × 4
## # Groups:   drv [3]
##   displ   hwy drv   drive_type       
##   <dbl> <int> <chr> <chr>            
## 1   6.5    17 4     4-wheel drive    
## 2   5.3    25 f     front-wheel drive
## 3   7      24 r     rear-wheel drive
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE) +
  geom_text(
    data = label_info,
    aes(x = displ, y = hwy, label = drive_type),
    fontface = "bold", size = 5, hjust = "right", vjust = "bottom"
  ) +
  theme(legend.position = "none")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#> `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE) +
  geom_text(
    data = label_info,
    aes(x = displ, y = hwy, label = drive_type),
    fontface = "bold", size = 5, hjust = "right", vjust = "bottom"
  ) + # hjust (horizontal justification) and vjust (vertical justification) to control the alignment of the label.
  theme(legend.position = "none")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE) +
  geom_label_repel( #  We can use the geom_label_repel() function from the ggrepel package to address overlap
    data = label_info,
    aes(x = displ, y = hwy, label = drive_type),
    fontface = "bold", size = 5, nudge_y = 2 # Using the fontface and size arguments we can customize the look of the text labels
  ) +
  theme(legend.position = "none") # (theme(legend.position = "none") turns all the legends off
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

potential_outliers <- mpg |>
  filter(hwy > 40 | (hwy > 20 & displ > 5))

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_text_repel(data = potential_outliers, aes(label = model)) +
  geom_point(data = potential_outliers, color = "red") + # make outlier points red
  geom_point(
    data = potential_outliers,
    color = "red", size = 3, shape = "circle open"
  ) # circle around circle

trend_text <- "Larger engine sizes tend to have lower fuel economy." |>
  str_wrap(width = 30)
trend_text
## [1] "Larger engine sizes tend to\nhave lower fuel economy."
# annotation!
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  annotate(
    geom = "label", x = 3.5, y = 38,
    label = trend_text,
    hjust = "left", color = "red"
  ) +
  annotate(
    geom = "segment",
    x = 3, y = 35, xend = 5, yend = 25, color = "red",
    arrow = arrow(type = "closed")
  )

11.3.1 Exercises with Annotations Use geom_text() with infinite positions to place text at the four corners of the plot.

corner_labels <- tibble(
  x = c(-Inf, -Inf, Inf, Inf),
  y = c(-Inf, Inf, -Inf, Inf),
  label = c(
    "(x0,y0)", "(x0,y1)",
    "(x1,y0)", "(x1,y1)"
  ),
  hjust = c(0, 0, 1, 1),
  vjust = c(0, 1, 0, 1)
)

annoation_plot <- mpg |> ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  geom_text(data = corner_labels, aes(
    x = x, y = y, hjust = hjust, vjust = vjust, label = label
  ))
annoation_plot

Use annotate() to add a point geom in the middle of your last plot without having to create a tibble. Customize the shape, size, or color of the point.

annoation_plot + annotate(
  geom = "point", x = 4.5, y = 30,
  color = "pink", size = 15, shape = 15
)

How do labels with geom_text() interact with faceting? How can you add a label to a single facet? How can you put a different label in each facet? (Hint: Think about the dataset that is being passed to geom_text().)

label_for_all <- tibble(
  x = Inf,
  y = Inf,
  label = "text for all",
  vjust = "inward",
  hjust = "inward",
  angle = 0,
)

mpg |> ggplot(aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~class) +
  geom_text(data = label_for_all, aes(label = label, x = x, y = y, vjust = vjust, hjust = hjust, angle = angle))

Label for a single facet

label_for_minivan <- tibble(
  x = Inf,
  y = Inf,
  label = "text for minivan",
  vjust = "inward",
  hjust = "inward",
  angle = 0,
  class = "minivan"
)

mpg |> ggplot(aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~class) +
  geom_text(data = label_for_minivan, aes(label = label, x = x, y = y, vjust = vjust, hjust = hjust, angle = angle))

Labels for different facets

# different facets
label_different <- mpg |>
  group_by(class) |>
  summarise(mean_mpg = round(mean(hwy), 1)) |>
  mutate(
    x = Inf,
    y = Inf,
    label = paste(class, "average highway mpg:", mean_mpg),
    vjust = "inward",
    hjust = "inward",
    angle = 0
  )

mpg |> ggplot(aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~class) +
  geom_text(data = label_different, aes(label = label, x = x, y = y, vjust = vjust, hjust = hjust, angle = angle))

Playing with hjust and vjust

td <- expand.grid(
  hjust = c(0, 0.5, 1),
  vjust = c(0, 0.5, 1),
  angle = c(0, 45, 90),
  text = "text"
)

ggplot(td, aes(x = hjust, y = vjust)) +
  geom_point() +
  geom_text(aes(label = text, angle = angle, hjust = hjust, vjust = vjust)) +
  facet_grid(~angle) +
  scale_x_continuous(breaks = c(0, 0.5, 1), expand = c(0, 0.2)) +
  scale_y_continuous(breaks = c(0, 0.5, 1), expand = c(0, 0.2))

Corner Labels

# One label per panel corner. hjust must follow x (0 = left-aligned at the
# left edge, 1 = right-aligned at the right edge) and vjust must follow y
# (0 = bottom-aligned at the bottom, 1 = top-aligned at the top); otherwise
# the text is drawn outside the panel. The two vectors were swapped here.
corner_labels <- tibble(
  x = c(-Inf, -Inf, Inf, Inf),
  y = c(-Inf, Inf, -Inf, Inf),
  label = c(
    "(x0,y0)", "(x0,y1)",
    "(x1,y0)", "(x1,y1)"
  ),
  hjust = c(0, 0, 1, 1),
  vjust = c(0, 1, 0, 1)
)

annoation_plot <- mpg |> ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  geom_text(data = corner_labels, aes(
    x = x, y = y, hjust = hjust, vjust = vjust, label = label
  ))
annoation_plot

What arguments to geom_label() control the appearance of the background box?

mpg |> ggplot(aes(x = displ, y = hwy)) +
  geom_point() +
  geom_label(data = data.frame(), label.padding = unit(0.55, "lines"), label.size = 1, color = "blue", size = 10, aes(
    x = Inf, y = Inf, hjust = "inward", vjust = "inward", label = "Hello!"
  ))

What are the four arguments to arrow()? How do they work? Create a series of plots that demonstrate the most important options.

mpg |> ggplot(aes(x = displ, y = cty)) +
  geom_point() +
  annotate(
    geom = "segment",
    x = 3, y = 35, xend = 5, yend = 25, color = "red",
    arrow = arrow(type = "open", angle = 45, ends = "last")
  ) +
  annotate(
    geom = "segment",
    x = 2, y = 35, xend = 4, yend = 25, color = "blue",
    arrow = arrow(type = "closed", angle = 35, ends = "first", length = unit(1, "cm"))
  ) +
  annotate(
    geom = "segment",
    x = 2, y = 35, xend = 4, yend = 15, color = "green",
    arrow = arrow(type = "closed", angle = 35, ends = "last")
  )

Scales

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  scale_x_continuous() +
  scale_y_continuous() +
  scale_color_discrete()

Axis ticks and legend keys

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  scale_y_continuous(breaks = seq(15, 40, by = 5)) 

Removing Ticks

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  scale_x_continuous(labels = NULL) +
  scale_y_continuous(labels = NULL) +
  scale_color_discrete(labels = c("4" = "4-wheel", "f" = "front", "r" = "rear")) # for legend

ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  scale_x_continuous(labels = NULL) +
  scale_y_continuous(labels = NULL) 

Dollars in the scale

library(scales)
ggplot(diamonds, aes(x = price, y = cut)) +
  geom_boxplot(alpha = 0.05) +
  scale_x_continuous(labels = label_dollar()) # uses scales package

More custom dollars (1k, 7k, etc.)

ggplot(diamonds, aes(x = price, y = cut)) +
  geom_boxplot(alpha = 0.05) +
  scale_x_continuous(
    labels = label_dollar(scale = 1/1000, suffix = "K"), 
    breaks = seq(1000, 19000, by = 6000)
  )

Label percentages for bar charts

ggplot(diamonds, aes(x = cut, fill = clarity)) +
  geom_bar(position = "fill") +
  scale_y_continuous(name = "Percentage", labels = label_percent())

Breaks for only a few data points

presidential |>
  mutate(id = 33 + row_number()) |>
  ggplot(aes(x = start, y = id)) +
  geom_point() +
  geom_segment(aes(xend = end, yend = id)) +
  scale_x_date(name = NULL, breaks = presidential$start, date_labels = "'%y")

Position of the legend

base <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class))

base + theme(legend.position = "right") # the default

base + theme(legend.position = "left")

base + 
  theme(legend.position = "top") +
  guides(color = guide_legend(nrow = 3))

base + 
  theme(legend.position = "bottom") +
  guides(color = guide_legend(nrow = 2)) 

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(se = FALSE) +
  theme(legend.position = "bottom") +
  guides(color = guide_legend(nrow = 2, override.aes = list(size = 6))) # larger legend symbols
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

11.4.4 Replacing a scale

ggplot(diamonds, aes(x = carat, y = price)) +
  geom_bin2d()

# better to log transform
ggplot(diamonds, aes(x = log10(carat), y = log10(price))) +
  geom_bin2d()

adjust scale to log-transform

ggplot(diamonds, aes(x = carat, y = price)) +
  geom_bin2d() + 
  scale_x_log10() + 
  scale_y_log10()

change color scale

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = drv)) +
  scale_color_brewer(palette = "Set1")

custom colors scale (blue democrat, red = republican)

presidential |>
  mutate(id = 33 + row_number()) |>
  ggplot(aes(x = start, y = id, color = party)) +
  geom_point() +
  geom_segment(aes(xend = end, yend = id)) +
  scale_color_manual(values = c(Republican = "#E81B23", Democratic = "#00AEF3"))

dealing with color blindness

df <- tibble(
  x = rnorm(10000),
  y = rnorm(10000)
)

ggplot(df, aes(x, y)) +
  geom_hex() +
  coord_fixed() +
  labs(title = "Default, continuous", x = NULL, y = NULL)

ggplot(df, aes(x, y)) +
  geom_hex() +
  coord_fixed() +
  scale_fill_viridis_c() +
  labs(title = "Viridis, continuous", x = NULL, y = NULL)

ggplot(df, aes(x, y)) +
  geom_hex() +
  coord_fixed() +
  scale_fill_viridis_b() +
  labs(title = "Viridis, binned", x = NULL, y = NULL)

11.4.5 Zooming

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = drv)) +
  geom_smooth() +
  scale_x_continuous(limits = c(5, 6)) +
  scale_y_continuous(limits = c(10, 25))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 202 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 202 rows containing missing values or values outside the scale range
## (`geom_point()`).

Using coord_cartesian to set limits

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = drv)) +
  geom_smooth() +
  coord_cartesian(xlim = c(5, 6), ylim = c(10, 25))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Random packages (janitor, skimr)

library(ryouwithme)
# exploring
skim(mpg)
Data summary
Name mpg
Number of rows 234
Number of columns 11
_______________________
Column type frequency:
character 6
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
manufacturer 0 1 4 10 0 15 0
model 0 1 2 22 0 38 0
trans 0 1 8 10 0 10 0
drv 0 1 1 1 0 3 0
fl 0 1 1 1 0 5 0
class 0 1 3 10 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
displ 0 1 3.47 1.29 1.6 2.4 3.3 4.6 7 ▇▆▆▃▁
year 0 1 2003.50 4.51 1999.0 1999.0 2003.5 2008.0 2008 ▇▁▁▁▇
cyl 0 1 5.89 1.61 4.0 4.0 6.0 8.0 8 ▇▁▇▁▇
cty 0 1 16.86 4.26 9.0 14.0 17.0 19.0 35 ▆▇▃▁▁
hwy 0 1 23.44 5.95 12.0 18.0 24.0 27.0 44 ▅▅▇▁▁
# Cross-tabulate model year against manufacturer
tabyl(mpg, year, manufacturer)
##  year audi chevrolet dodge ford honda hyundai jeep land rover lincoln mercury
##  1999    9         7    16   15     5       6    2          2       2       2
##  2008    9        12    21   10     4       8    6          2       1       2
##  nissan pontiac subaru toyota volkswagen
##       6       3      6     20         16
##       7       2      8     14         11
# cleaning
# clean_names() rewrites the column names as all lowercase with underscores
beaches <- sydneybeaches |>
  clean_names()
glimpse(beaches)
## Rows: 3,690
## Columns: 8
## $ beach_id              <dbl> 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, …
## $ region                <chr> "Sydney City Ocean Beaches", "Sydney City Ocean …
## $ council               <chr> "Randwick Council", "Randwick Council", "Randwic…
## $ site                  <chr> "Clovelly Beach", "Clovelly Beach", "Clovelly Be…
## $ longitude             <dbl> 151.2675, 151.2675, 151.2675, 151.2675, 151.2675…
## $ latitude              <dbl> -33.91449, -33.91449, -33.91449, -33.91449, -33.…
## $ date                  <chr> "02/01/2013", "06/01/2013", "12/01/2013", "18/01…
## $ enterococci_cfu_100ml <dbl> 19, 3, 2, 13, 8, 7, 11, 97, 3, 0, 6, 0, 1, 8, 3,…
# Give the bacteria-count column a shorter name
beaches <- rename(beaches, beachbugs = enterococci_cfu_100ml)

Most beach bugs

# Row with the highest beachbugs reading at Coogee Beach
worst_day_coogee <- beaches |>
  filter(site == "Coogee Beach") |>
  arrange(desc(beachbugs)) |>
  head(1)

# Row with the highest beachbugs reading at Little Bay Beach
worst_day_little_bay <- beaches |>
  filter(site == "Little Bay Beach") |>
  arrange(desc(beachbugs)) |>
  head(1)

worst_day_coogee$beachbugs # 1200
## [1] 1200
worst_day_little_bay$beachbugs # 4900
## [1] 4900

Question B: Does Coogee or Bondi have more extreme bacteria levels? Which beach has the worst bacteria levels on average?

# Mean beachbugs reading per site (missing readings ignored), worst site first
bugs_beach <- beaches |>
  group_by(site) |>
  summarise(avg_bug = mean(beachbugs, na.rm = TRUE)) |>
  arrange(desc(avg_bug))

bugs_beach # Malabar has worst on average
## # A tibble: 11 × 2
##    site                    avg_bug
##    <chr>                     <dbl>
##  1 Malabar Beach              68.1
##  2 South Maroubra Rockpool    63.9
##  3 Little Bay Beach           45.6
##  4 Coogee Beach               39.4
##  5 Tamarama Beach             35.7
##  6 Bronte Beach               31.4
##  7 Gordons Bay (East)         24.9
##  8 Maroubra Beach             20.2
##  9 Bondi Beach                18.8
## 10 South Maroubra Beach       15.7
## 11 Clovelly Beach             10.2
# Keep only the Coogee and Bondi rows: Coogee is worse on average
bugs_beach |>
  filter(str_detect(site, "Coogee|Bondi"))
## # A tibble: 2 × 2
##   site         avg_bug
##   <chr>          <dbl>
## 1 Coogee Beach    39.4
## 2 Bondi Beach     18.8

Total beachbugs (sum of all readings) per site

# Sum of all beachbugs readings per site, largest total first.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned,
# whereas TRUE is a reserved constant.
beaches |>
  group_by(site) |>
  summarise(tl_bug = sum(beachbugs, na.rm = TRUE)) |>
  arrange(desc(tl_bug))
## # A tibble: 11 × 2
##    site                    tl_bug
##    <chr>                    <dbl>
##  1 Malabar Beach            23227
##  2 South Maroubra Rockpool  20064
##  3 Little Bay Beach         15325
##  4 Coogee Beach             13349
##  5 Tamarama Beach           11969
##  6 Bronte Beach             10526
##  7 Gordons Bay (East)        8018
##  8 Maroubra Beach            6760
##  9 Bondi Beach               6271
## 10 South Maroubra Beach      5277
## 11 Clovelly Beach            3413

Custom Theme

# theme_jen(): a minimal custom ggplot2 theme.
#
# Starts from theme_bw() and replaces selected elements: no panel border or
# grid lines, a 1 cm margin around the plot, black axis lines, and Helvetica
# text at fixed sizes for axes, titles, caption and legend.
#
# Takes no arguments. Returns a theme object to add to a plot with `+`.
#
# Line widths are set with `linewidth`; the `size` argument of element_line()
# and element_rect() is deprecated as of ggplot2 3.4.0.
theme_jen <- function() {
  # define font up front so every text element uses the same family
  font <- "Helvetica"

  # this theme uses theme_bw as the base; %+replace% swaps out each listed
  # element wholesale instead of merging it with the base element
  theme_bw() %+replace%
    theme(
      # get rid of grid lines/borders
      panel.border = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      # add white space top, right, bottom, left
      plot.margin = unit(c(1, 1, 1, 1), "cm"),
      # custom axis title/text/lines
      axis.title = element_text(
        family = font,
        size = 14
      ),
      axis.text = element_text(
        family = font,
        size = 12
      ),
      # margin pulls text away from axis (top = 5, bottom = 10)
      axis.text.x = element_text(
        margin = margin(t = 5, b = 10)
      ),
      # black axis lines
      axis.line = element_line(colour = "black", linewidth = rel(1)),
      # custom plot titles, subtitles, captions
      plot.title = element_text(
        family = font,
        size = 18,
        # negative hjust pushes the title to the left of the panel edge
        hjust = -0.1,
        vjust = 4
      ),
      plot.subtitle = element_text(
        family = font,
        size = 14,
        hjust = 0,
        vjust = 3
      ),
      plot.caption = element_text(
        family = font,
        size = 10,
        hjust = 1,
        vjust = 2
      ),
      # custom legend
      legend.title = element_text(
        family = font,
        size = 10,
        hjust = 0
      ),
      legend.text = element_text(
        family = font,
        size = 8,
        hjust = 0
      ),
      # no background on legend keys
      legend.key = element_blank(),
      # facet strip labels: white fill with a black outline
      strip.background = element_rect(
        fill = "white",
        colour = "black",
        linewidth = rel(2)
      ),
      complete = TRUE
    )
}

GGEasy

# Download the TidyTuesday data for the week of 2021-01-05
tt <- tt_load("2021-01-05")
## --- Compiling #TidyTuesday Information for 2021-01-05 ----
## --- There is 1 file available ---
## --- Starting Download ---
## 
##  Downloading file 1 of 1: `transit_cost.csv`
## --- Download complete ---
# Pull the transit_cost table out of the downloaded TidyTuesday object
cost <- tt$transit_cost

# easy_expand_y_axis(): continuous y scale with the expansion set to zero.
#
# Despite the name, this removes the padding ggplot2 normally adds around the
# data on the y axis (expand = c(0, 0)).
# Takes no arguments; add the returned scale to a plot with `+`.
easy_expand_y_axis <- function() {
  no_padding <- c(0, 0)
  scale_y_continuous(expand = no_padding)
}

# Bar chart of the five countries with the highest mean transit cost per km.
#
# Theme order matters: adding a complete theme replaces every theme setting
# added before it. The dark theme is therefore added before
# easy_remove_legend(), otherwise the legend removal is discarded and the
# legend is drawn.
# NOTE(review): assumes ggdark::dark_theme_dark() is a complete theme, like
# ggplot2's built-in themes; if so it also overrides theme_jen() here --
# confirm which of the two themes is intended to win.
cost %>%
  group_by(country) %>%
  summarise(meancost = mean(cost_km_millions)) %>%
  arrange(desc(meancost)) %>%
  head(5) %>%
  ggplot(aes(x = reorder(country, meancost), y = meancost, fill = country)) +
  geom_col() +
  labs(
    y = "Average cost per km (million)", x = "Country",
    title = "Countries with the most expensive transit projects",
    caption = "why is the US so $$$$?"
  ) +
  theme_jen() +
  ggdark::dark_theme_dark() +
  # theme tweaks go after the complete theme so they are not overwritten
  easy_remove_legend() +
  easy_expand_y_axis()
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().